diff --git a/.travis.yml b/.travis.yml index 97c60d9..25bb373 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ python: install: - pip3 install --upgrade setuptools==41.0.0 - pip3 install . - #- pip3 install -r requirements.txt +# - pip3 install -r requirements.txt # commands to run tes # before_script: redis-cli ping @@ -22,7 +22,8 @@ script: - python3.6 ./tests/rbm/test_Logistic_Rule_Regression.py - python3.6 ./tests/lime/test_lime.py - python3.6 ./tests/shap/test_shap.py - + - python3.6 ./tests/sif/test_SIF.py + - python3.6 ./tests/sif/test_SIF.py after_success: # - codecov diff --git a/README.md b/README.md index 478d673..9fe9491 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ We have developed the package with extensibility in mind. This library is still - Contrastive Explanations Method with Monotonic Attribute Functions ([Luss et al., 2019](https://arxiv.org/abs/1905.12698)) - LIME ([Ribeiro et al. 2016](https://arxiv.org/abs/1602.04938), [Github](https://github.com/marcotcr/lime)) - SHAP ([Lundberg, et al. 2017](http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions), [Github](https://github.com/slundberg/shap)) - +- SIF ([Jianbo, et al. 2021](https://ojs.aaai.org/index.php/AAAI/article/view/17379/17186)) ### Local direct explanation - Teaching AI to Explain its Decisions ([Hind et al., 2019](https://doi.org/10.1145/3306618.3314273)) diff --git a/aix360/algorithms/sif/SIF.py b/aix360/algorithms/sif/SIF.py new file mode 100644 index 0000000..73b301a --- /dev/null +++ b/aix360/algorithms/sif/SIF.py @@ -0,0 +1,1186 @@ +from __future__ import division +from __future__ import print_function +from __future__ import absolute_import +from __future__ import unicode_literals +import numpy as np +import pandas as pd +import copy +from scipy.optimize import fmin_ncg +import os.path +import time +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf +from tensorflow.python.ops import array_ops +from keras import backend as K +import collections +from aix360.algorithms.sif.SIF_utils import hessian_vector_product, hessians, operate_deep, operate_deep_2v, normalize_vector +from abc import ABC, abstractmethod +from aix360.datasets.SIF_dataset import DataSet +from scipy.stats import linregress +from aix360.algorithms.lbbe import LocalBBExplainer + + +class SIFExplainer(LocalBBExplainer): + def __init__(self, **kwargs): + ''' + Initialize the SIF neural network + ''' + np.random.seed(0) + tf.set_random_seed(0) + self.Datasets = collections.namedtuple('Datasets', ['train', 'validation', 'test']) + self.batch_size = kwargs.pop('batch_size') + self.time_series = kwargs.pop('time_series') + self.data_sets = kwargs.pop('data_sets') + self.train_dir = kwargs.pop('train_dir', 'output') + self.log_dir = kwargs.pop('log_dir', 'log') + self.model_name = kwargs.pop('model_name') + # self.num_classes = kwargs.pop('num_classes') + self.initial_learning_rate = kwargs.pop('initial_learning_rate') + self.decay_epochs = kwargs.pop('decay_epochs') + self.calc_hessian = kwargs.pop('calc_hessian') + + if 'keep_probs' in kwargs: + self.keep_probs = kwargs.pop('keep_probs') + else: + self.keep_probs = None + + if 'mini_batch' in kwargs: + self.mini_batch = kwargs.pop('mini_batch') + else: + self.mini_batch = True + + if 'damping' in kwargs: + self.damping = kwargs.pop('damping') + else: + self.damping = 0.0 + + if not os.path.exists(self.train_dir): + os.makedirs(self.train_dir) + + # Initialize session + config = 
tf.compat.v1.ConfigProto() + self.sess = tf.compat.v1.Session(config=config) + + # TODO: Remove me after debugging finishes + # self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) + K.set_session(self.sess) + + # Setup input + self.input_index_placeholder, self.labels_index_placeholder, self.ts_placeholder = self.placeholder_inputs() + self.num_train_examples = self.data_sets.train.labels.shape[0] + self.num_test_examples = self.data_sets.test.labels.shape[0] + + if isinstance(self.input_index_placeholder, list) or isinstance(self.input_index_placeholder, tuple): + self.input_placeholder = tuple(tf.gather(self.ts_placeholder, x) for x in self.input_index_placeholder) + else: + self.input_placeholder = tf.gather(self.ts_placeholder, self.input_index_placeholder, + name='input_placeholder') + self.labels_placeholder = tf.reshape(tf.gather(self.ts_placeholder, self.labels_index_placeholder), + [-1, self.ts_placeholder.shape[1].value], name='label_placeholder') + + # Setup inference and training + if self.keep_probs is not None: + self.keep_probs_placeholder = tf.placeholder(tf.float32, shape=(2)) + self.logits = self.inference(self.input_placeholder, keep_probs_placeholder=self.keep_probs_placeholder) + elif hasattr(self, 'inference_needs_labels'): + self.logits = self.inference(self.input_placeholder, labels_placeholder=self.labels_placeholder) + else: + self.logits = self.inference(self.input_placeholder) + + self.total_loss, self.loss_no_reg, self.indiv_loss_no_reg = self.loss( + self.logits, + self.labels_placeholder) + self.global_step = tf.Variable(0, name='global_step', trainable=False) + self.learning_rate = tf.Variable(self.initial_learning_rate, name='learning_rate', trainable=False) + self.learning_rate_placeholder = tf.placeholder(tf.float32) + self.update_learning_rate_op = tf.assign(self.learning_rate, self.learning_rate_placeholder) + # self.learning_rate = 0.01 + self.train_op = self.get_train_op(self.total_loss, self.global_step, self.learning_rate) + self.train_sgd_op = self.get_train_sgd_op(self.total_loss, self.global_step, self.learning_rate) + # self.accuracy_op = self.get_accuracy_op(self.logits, self.labels_placeholder) + self.preds = self.predictions(self.logits) + + # Setup misc + self.saver = tf.train.Saver() + + # Setup gradients and Hessians + self.params = self.get_all_params() + # \nabla_\theta L + self.grad_total_loss_op = tf.gradients(self.total_loss, self.params) + # remove the independent parameters + self.params, self.grad_total_loss_op = zip( + *[(a, b) for a, b in zip(self.params, self.grad_total_loss_op) if b is not None]) + # \nabla_\theta L: no regularizer + self.grad_loss_no_reg_op = tf.gradients(self.loss_no_reg, self.params) + # u,v: a list the same size \theta + self.v_placeholder = [tf.placeholder(tf.float32, shape=a.get_shape(), name='v_placeholder') for a in + self.params] + + # H \cdot v + self.hessian_vector = hessian_vector_product(self.total_loss, self.params, self.v_placeholder) + + # \nabla_x L + self.grad_loss_wrt_input_op = tf.gradients(self.total_loss, self.ts_placeholder) + # \nabla_\theta L \cdots v + # Because tf.gradients auto accumulates, we probably don't need the add_n (or even reduce_sum) + self.influence_op = tf.add_n( + [tf.reduce_sum(tf.multiply(a, array_ops.stop_gradient(b))) + for a, b in zip(self.grad_total_loss_op, self.v_placeholder)]) + # (\nabla_x\nabla_\theta L) \codts v: assuming v not depend on x + # self.grad_influence_wrt_input_op = tf.gradients(self.influence_op, self.ts_placeholder) + self.subgradient = 
tf.gradients(self.total_loss, self.params[0]) + if self.calc_hessian: + self.hessian = hessians(self.total_loss, self.params) + + # self.grad_loss_wrt_para_input_op = derivative_x1_x2(self.total_loss, self.params, + # self.ts_placeholder) + + self.checkpoint_file = os.path.join(self.train_dir, "%s-checkpoint" % self.model_name) + + self.all_train_feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.train) + self.all_test_feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.test) + # self.optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.01) + self.optimizer = tf.compat.v1.train.MomentumOptimizer(0.007, momentum=0.95) + # self.optimizer = tf.compat.v1.train.AdamOptimizer(0.01) + # self.optimizer = tf.compat.v1.train.AdadeltaOptimizer(0.001) + self.t = tf.Variable(tf.random.normal(self.v_placeholder[0].shape), name='t', trainable=True) + # self.hessian_vector_1 = hessian_vector_product(self.total_loss, self.params, [self.t]) + # self.hessian_vector_val = tf.add(self.hessian_vector_1, self.t * self.damping) + # fun = 0.5 * tf.matmul(tf.transpose(self.t), self.hessian_vector_val) - tf.matmul( + # tf.transpose(self.v_placeholder[0]), self.t) + + # --------- test ---------- + self.hessian = hessians(self.total_loss, self.params)[0] + # self.hessian_vector_val = tf.add(tf.matmul(self.hessian[0], self.t), self.t * self.damping) + fun = 0.5 * tf.matmul(tf.transpose(self.t), tf.matmul(self.hessian[0], self.t)) - tf.matmul( + tf.transpose(self.v_placeholder[0]), self.t) + # --------- test ---------- + + # self.training_process = tf.compat.v1.train.GradientDescentOptimizer(self.learning_rate).minimize(fun, var_list=[self.t]) + self.training_process = self.optimizer.minimize(fun, var_list=[self.t]) + init = tf.global_variables_initializer() + self.sess.run(init) + + self.num_train_step = None + self.iter_to_switch_to_batch = None + self.iter_to_switch_to_sgd = None + + self.vec_to_list = self.get_vec_to_list_fn() + self.adversarial_loss, self.indiv_adversarial_loss = self.adversarial_loss(self.logits, self.labels_placeholder) + if self.adversarial_loss is not None: + self.grad_adversarial_loss_op = tf.gradients(self.adversarial_loss, self.params) + + # newly added configure to speed up the computation + self.n_time_stamp = self.time_series.shape[0] + self.n_non_nan_y_cont = 0 + self.non_nan_y_cont_idx = [] + super().__init__() + + def set_params(self, **kwargs): + """ + Optionally, set parameters for the explainer. + """ + pass + + def explain_instance(self, y_contaminate, index, patchy_k, gammas=None, expectation_rep_time=10, verbose=True, + is_ar=False, short_memo=True): + """ + Explain one or more input instances. 
+ """ + sif = self.get_sif(y_contaminate, index, patchy_k, gammas, expectation_rep_time, verbose, is_ar, short_memo) + return sif + + def update_configure(self, y_contaminate, gammas): + ''' + y_contaminate: the contaminating process + gammas: gamma in equation 1 + ''' + self.n_non_nan_y_cont = self.n_time_stamp - np.isnan(y_contaminate).sum() + self.rest_gamma = gammas[np.nonzero(gammas)] + self.large_gamma = [x for x in self.rest_gamma if x * self.n_time_stamp >= self.n_non_nan_y_cont] + self.rest_gamma = [x for x in self.rest_gamma if x * self.n_time_stamp < self.n_non_nan_y_cont] + self.non_nan_y_cont_idx = np.logical_not(np.isnan(y_contaminate)) + + def get_vec_to_list_fn(self): + params_val = self.sess.run(self.params) + + self.num_params = len(np.concatenate([np.reshape(x, (-1, 1)) for x in params_val])) + print('Total number of parameters: %s' % self.num_params) + + def vec_to_list(v): + return_list = [] + cur_pos = 0 + for p in params_val: + return_list.append(np.reshape(v[cur_pos: cur_pos + p.size], p.shape)) + cur_pos += p.size + + assert cur_pos == len(v) + return return_list + + return vec_to_list + + def reset_datasets(self): + ''' + reset the batch + ''' + for data_set in self.data_sets: + if data_set is not None: + data_set.reset_batch() + + def fill_feed_dict_with_all_ex(self, data_set): + ''' + return a dictionary which contains all examples + ''' + feed_dict = { + self.ts_placeholder: self.time_series, + self.input_index_placeholder: data_set.x, + self.labels_index_placeholder: data_set.labels + } + return feed_dict + + def fill_feed_dict_with_all_but_one_ex(self, data_set, idx_to_remove): + ''' + return a dictionary which contains all but one example (leave one out) + data_set: the dataset + idx_to_remove: the index that is not included in the returned dictionary + ''' + num_examples = data_set.num_examples + idx = np.array([True] * num_examples, dtype=bool) + idx[idx_to_remove] = False + + if isinstance(data_set.x, list): + input_x = [x[idx, :] for x in data_set.x] + else: + input_x = data_set[idx, :] + + feed_dict = { + self.ts_placeholder: self.time_series, + self.input_index_placeholder: input_x, + self.labels_index_placeholder: data_set.labels[idx] + } + return feed_dict + + def fill_feed_dict_with_batch(self, data_set, batch_size=0): + ''' + return a dictionary whose batch is equal to batch_size + data_set: the dataset + batch_size: the size of the batch to be returned + ''' + if batch_size is None: + return self.fill_feed_dict_with_all_ex(data_set) + elif batch_size == 0: + batch_size = self.batch_size + + input_feed, labels_feed = data_set.next_batch(batch_size) + + feed_dict = { + self.ts_placeholder: self.time_series, + self.input_index_placeholder: input_feed, + self.labels_index_placeholder: labels_feed + } + return feed_dict + + def fill_feed_dict_with_some_ex(self, data_set, target_indices, time_seres=None): + ''' + return a dictionary whose batch is equal to batch_size + data_set: the dataset + target_indices: the indices of examples to be returned + ''' + if isinstance(data_set.x, list): + input_feed = [x[target_indices, :].reshape(len(target_indices), -1) + for x in data_set.x] + else: + input_feed = data_set.x[target_indices, :].reshape(len(target_indices), -1) + labels_feed = data_set.labels[target_indices, :].reshape(len(target_indices), -1) + + feed_dict = { + self.ts_placeholder: self.time_series if time_seres is None else time_seres, + self.input_index_placeholder: input_feed, + self.labels_index_placeholder: labels_feed + } + return feed_dict + + 
def fill_feed_dict_with_one_ex(self, data_set, target_idx, time_series=None): + ''' + return a dictionary which contains one example + data_set: the dataset + target_idx: the index of an example to be returned + ''' + if isinstance(data_set.x, list): + input_x = [x[target_idx, :].reshape(1, -1) for x in data_set.x] + else: + input_x = data_set.x[target_idx, :].reshape(1, -1) + labels_feed = data_set.labels[target_idx, :].reshape(1, -1) + feed_dict = { + self.ts_placeholder: self.time_series if time_series is None else time_series, + self.input_index_placeholder: input_x, + self.labels_index_placeholder: labels_feed + } + return feed_dict + + def fill_feed_dict_with_one_perturb(self, data_set, target_idx, epsilon): + ''' + return a dictionary which contains one example after perturbation + data_set: the dataset + target_idx: the index of an example to be returned + ''' + if target_idx not in data_set.labels: + if isinstance(data_set.x, (list, tuple)): + assert any([target_idx in x for x in data_set.x]), "The index must be included in the dataset" + else: + assert target_idx in data_set.x, "The index must be included in the dataset" + + new_ts = np.copy(self.time_series) + new_ts[target_idx] += epsilon + feed_dict = { + self.ts_placeholder: new_ts, + self.input_index_placeholder: data_set.x, + self.labels_index_placeholder: data_set.labels + } + return feed_dict + + def fill_feed_dict_manual(self, X, Y): + ''' + return a dictionary with user-defined examples + X: the input data + Y: the label of input data + ''' + X = np.array(X) + Y = np.array(Y) + input_feed = X.reshape(len(Y), -1) + labels_feed = Y.reshape(-1) + feed_dict = { + self.ts_placeholder: self.time_series, + self.input_index_placeholder: input_feed, + self.labels_index_placeholder: labels_feed + } + return feed_dict + + def minibatch_mean_eval(self, ops, data_set): + + num_examples = data_set.num_examples + # TODO: Let's think about this + # assert num_examples % self.batch_size == 0 + num_iter = int(num_examples / self.batch_size) + self.reset_datasets() + ret = [] + for i in xrange(num_iter): + feed_dict = self.fill_feed_dict_with_batch(data_set) + ret_temp = self.sess.run(ops, feed_dict=feed_dict) + + if len(ret) == 0: + for b in ret_temp: + if isinstance(b, list) or isinstance(b, tuple): + ret.append([c / float(num_iter) for c in b]) + else: + ret.append([b / float(num_iter)]) + else: + for counter, b in enumerate(ret_temp): + if isinstance(b, list) or isinstance(b, tuple): + ret[counter] = [a + (c / float(num_iter)) for (a, c) in zip(ret[counter], b)] + else: + ret[counter] += (b / float(num_iter)) + + return ret + + def get_train_y_hat(self): + ''' + evaluate the y_hat with training examples + ''' + return self.sess.run(self.preds, feed_dict=self.all_train_feed_dict) + + def get_test_y_hat(self): + ''' + evaluate the y_hat with test examples + ''' + return self.sess.run(self.preds, feed_dict=self.all_test_feed_dict) + + def print_model_eval(self): + ''' + print the evaluation of the model related information, such as loss, gradients, norm of parameters, etc. 
+ ''' + params_val = self.sess.run(self.params) + + if self.mini_batch == True: + grad_loss_val, loss_no_reg_val, loss_val = self.minibatch_mean_eval( + [self.grad_total_loss_op, self.loss_no_reg, self.total_loss], + self.data_sets.train) + + test_loss_val = self.minibatch_mean_eval( + [self.loss_no_reg], + self.data_sets.test) + + else: + grad_loss_val, loss_no_reg_val, loss_val = self.sess.run( + [self.grad_total_loss_op, self.loss_no_reg, self.total_loss], + feed_dict=self.all_train_feed_dict) + + test_loss_val = self.sess.run( + [self.loss_no_reg], + feed_dict=self.all_test_feed_dict) + + print('Train loss (w reg) on all data: %s' % loss_val) + print('Train loss (w/o reg) on all data: %s' % loss_no_reg_val) + print('Test loss (w/o reg) on all data: %s' % test_loss_val) + + print('Norm of the mean of gradients: %s' % np.linalg.norm( + np.concatenate([np.reshape(x, [-1, 1]) for x in grad_loss_val]))) + print('Norm of the params: %s' % np.linalg.norm(np.concatenate([np.reshape(x, [-1, 1]) for x in params_val]))) + + def retrain(self, num_steps, feed_dict): + ''' + retrain the model with num_steps iterations + num_steps: the number of iterations + feed_dict: the training examples + ''' + for step in xrange(num_steps): + self.sess.run(self.train_op, feed_dict=feed_dict) + + def update_learning_rate(self, step): + ''' + update the learning rate + step: when the step or iteration is reached, update the learning rate + ''' + if self.mini_batch: + assert self.num_train_examples % self.batch_size == 0 + num_steps_in_epoch = self.num_train_examples / self.batch_size + else: + num_steps_in_epoch = 1 + epoch = step // num_steps_in_epoch + + multiplier = 1 + if epoch < self.decay_epochs[0]: + multiplier = 1 + elif epoch < self.decay_epochs[1]: + multiplier = 0.1 + else: + multiplier = 0.01 + + self.sess.run( + self.update_learning_rate_op, + feed_dict={self.learning_rate_placeholder: multiplier * self.initial_learning_rate}) + + def train(self, num_steps, + iter_to_switch_to_batch=20000, + iter_to_switch_to_sgd=40000, + save_checkpoints=True, verbose=True, + ): + """ + Trains a model for a specified number of steps. 
+        num_steps: the number of iterations
+        iter_to_switch_to_batch: the number of iterations after which to switch to batch training
+        iter_to_switch_to_sgd: the number of iterations after which to switch to the SGD optimizer
+        save_checkpoints: whether to save the model at the checkpoints
+        verbose: whether to print progress messages during training
+        """
+        if verbose: print('Training for %s steps' % num_steps)
+        self.num_train_step = num_steps
+        self.iter_to_switch_to_batch = iter_to_switch_to_batch
+        self.iter_to_switch_to_sgd = iter_to_switch_to_sgd
+
+        sess = self.sess
+
+        # Tensorboard
+        # train_writer = tf.summary.FileWriter('./logs/{}/train'.format(datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")),
+        #                                      sess.graph)
+        # tf.summary.scalar("loss", self.total_loss)
+        # merged = tf.summary.merge_all()
+        org_loss = -100
+        err = []
+        for step in range(num_steps):
+            self.update_learning_rate(step)
+
+            start_time = time.time()
+
+            if step < iter_to_switch_to_batch:
+                feed_dict = self.fill_feed_dict_with_batch(self.data_sets.train)
+                _, loss_val = sess.run([self.train_op, self.total_loss], feed_dict=feed_dict)
+                # loss_val = sess.run([self.total_loss], feed_dict=feed_dict)
+
+            elif step < iter_to_switch_to_sgd:
+                feed_dict = self.all_train_feed_dict
+                _, loss_val = sess.run([self.train_op, self.total_loss], feed_dict=feed_dict)
+
+            else:
+                feed_dict = self.all_train_feed_dict
+                _, loss_val = sess.run([self.train_sgd_op, self.total_loss], feed_dict=feed_dict)
+
+            duration = time.time() - start_time
+
+            # train_writer.add_summary(summary, step)
+
+            if (step + 1) % 1000 == 0:
+                # Print status to stdout.
+                if verbose:
+                    print('Step %d: loss = %.8f (%.3f sec)' % (step, loss_val, duration))
+                # if(abs(loss_val - org_loss) < epsilon):
+                #     break
+                # org_loss = loss_val
+                err.append((step, loss_val))
+
+            # Save a checkpoint and evaluate the model periodically.
+            if (step + 1) % 10000 == 0 or (step + 1) == num_steps:
+                if save_checkpoints: self.saver.save(sess, self.checkpoint_file, global_step=step)
+                if verbose: self.print_model_eval()
+        return err
+
+        # train_writer.flush()
+
+    def load_checkpoint(self, iter_to_load, do_checks=True):
+        '''
+        load the model at the checkpoint
+        iter_to_load: the iteration number at which the model was saved
+        do_checks: print the information of the model after loading
+        '''
+        checkpoint_to_load = "%s_%s" % (self.checkpoint_file, iter_to_load)
+        self.saver.restore(self.sess, checkpoint_to_load)
+
+        if do_checks:
+            print('Model %s loaded. Sanity checks ---' % checkpoint_to_load)
+            self.print_model_eval()
+
+    def save(self, file_name, do_checks=False):
+        '''
+        save the model to a checkpoint file
+        file_name: the name of the file to which the model is saved
+        do_checks: print the information of the model after saving
+        '''
+        self.saver.save(self.sess, file_name)
+        if do_checks:
+            print('Model %s saved. Sanity checks ---' % file_name)
+            self.print_model_eval()
+
+    def restore(self, file_name, do_checks=False):
+        '''
+        load the model from a checkpoint file
+        file_name: the name of the file from which the model is loaded
+        do_checks: print the information of the model after loading
+        '''
+        self.saver.restore(self.sess, file_name)
+        if do_checks:
+            print('Model %s loaded. Sanity checks ---' % file_name)
+            self.print_model_eval()
+
+    def restore_train(self, file_name, init_step,
+                      iter_to_switch_to_batch=20000,
+                      iter_to_switch_to_sgd=40000,
+                      ):
+        """
+        Restores a model from a checkpoint and runs a single training step.
+        file_name: the name of the file with which the model is saved
+        init_step: the step index used to select the learning rate and batching mode
+        iter_to_switch_to_batch: the number of iterations after which to switch to batch training
+        iter_to_switch_to_sgd: the number of iterations after which to switch to the SGD optimizer
+        """
+        self.num_train_step = init_step
+        self.iter_to_switch_to_batch = iter_to_switch_to_batch
+        self.iter_to_switch_to_sgd = iter_to_switch_to_sgd
+
+        sess = self.sess
+
+        org_loss = -100
+        err = []
+
+        self.update_learning_rate(init_step)
+
+        start_time = time.time()
+
+        if init_step < iter_to_switch_to_batch:
+            feed_dict = self.fill_feed_dict_with_batch(self.data_sets.train)
+            _, loss_val = sess.run([self.train_op, self.total_loss], feed_dict=feed_dict)
+            # loss_val = sess.run([self.total_loss], feed_dict=feed_dict)
+
+        elif init_step < iter_to_switch_to_sgd:
+            feed_dict = self.all_train_feed_dict
+            _, loss_val = sess.run([self.train_op, self.total_loss], feed_dict=feed_dict)
+
+        else:
+            feed_dict = self.all_train_feed_dict
+            _, loss_val = sess.run([self.train_sgd_op, self.total_loss], feed_dict=feed_dict)
+
+        duration = time.time() - start_time
+
+        self.load_checkpoint(file_name, do_checks=True)
+        # train_writer.add_summary(summary, step)
+        print('initial loss = %.8f (%.3f sec)' % (loss_val, duration))
+
+    def get_train_op(self, total_loss, global_step, learning_rate):
+        """
+        Return train_op
+        total_loss: the loss function to be optimized
+        global_step: the global step for the optimizer
+        learning_rate: the learning rate of the Adam optimizer
+        """
+        optimizer = tf.train.AdamOptimizer(learning_rate)
+        train_op = optimizer.minimize(total_loss, global_step=global_step)
+        return train_op
+
+    def get_train_sgd_op(self, total_loss, global_step, learning_rate=0.001):
+        """
+        Return train_sgd_op
+        total_loss: the loss function to be optimized
+        global_step: the global step for the optimizer
+        learning_rate: the learning rate of the SGD optimizer
+        """
+        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+        train_op = optimizer.minimize(total_loss, global_step=global_step)
+        return train_op
+
+    def get_accuracy_op(self, logits, labels):
+        """Evaluate the quality of the logits at predicting the label.
+        Args:
+            logits: Logits tensor, float - [batch_size, NUM_CLASSES].
+            labels: Labels tensor, int32 - [batch_size], with values in the
+                range [0, NUM_CLASSES).
+        Returns:
+            A scalar int32 tensor with the number of examples (out of batch_size)
+            that were predicted correctly.
+        """
+        # correct = tf.nn.in_top_k(logits, labels, 1)
+        # return tf.reduce_sum(tf.cast(correct, tf.int32)) / tf.shape(labels)[0]
+        # Accuracy is not defined for this regression model, so return NaN.
+        return np.NaN
+
+    def loss(self, yhat, y):
+        '''
+        the squared difference between yhat and y; NaN entries are masked out before averaging.
+ yhat: the prediction of the label + y: the label + ''' + indiv_loss_no_reg = tf.squared_difference(yhat, y, name='indiv_loss') + # indiv_loss_no_reg = tf.Print(indiv_loss_no_reg, [yhat[0], y[0], indiv_loss_no_reg[0]]) + + # neglect nans when do the average + loss_no_reg = tf.reduce_mean(tf.boolean_mask(indiv_loss_no_reg, + tf.logical_not(tf.is_nan(indiv_loss_no_reg))), + name='loss_no_reg') + # loss_no_reg = tf.Print(loss_no_reg, [loss_no_reg]) + tf.add_to_collection('losses', loss_no_reg) + + total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss') + # total_loss = tf.Print(total_loss, [yhat[0], y[0], indiv_loss_no_reg[0], loss_no_reg, total_loss]) + + return total_loss, loss_no_reg, indiv_loss_no_reg + + def adversarial_loss(self, logits, labels): + return 0, 0 + + def update_feed_dict_with_v_placeholder(self, feed_dict, vec): + for pl_block, vec_block in zip(self.v_placeholder, vec): + shp = pl_block.get_shape().as_list() + shp = [-1 if x is None else x for x in shp] + feed_dict[pl_block] = np.reshape(vec_block, shp) + return feed_dict + + def get_inverse_hvp(self, v, approx_type='cg', approx_params=None, verbose=True): + assert approx_type in ['cg', 'lissa'] + if approx_type == 'lissa': + return self.get_inverse_hvp_lissa(v, **approx_params) + elif approx_type == 'cg': + return self.get_inverse_hvp_cg(v, verbose) + + def get_inverse_hvp_lissa(self, v, + batch_size=None, + scale=10, damping=0.0, num_samples=1, recursion_depth=10000): + """ + This uses mini-batching; uncomment code for the single sample case. + """ + inverse_hvp = None + print_iter = recursion_depth / 10 + + for i in range(num_samples): + # samples = np.random.choice(self.num_train_examples, size=recursion_depth) + + cur_estimate = v + + for j in range(recursion_depth): + feed_dict = self.fill_feed_dict_with_batch(self.data_sets.train, batch_size=batch_size) + + feed_dict = self.update_feed_dict_with_v_placeholder(feed_dict, cur_estimate) + hessian_vector_val = self.sess.run(self.hessian_vector, feed_dict=feed_dict) + cur_estimate = [a + (1 - damping) * b - c / scale + for (a, b, c) in zip(v, cur_estimate, hessian_vector_val)] + + # Update: v + (I - Hessian_at_x) * cur_estimate + if (j % print_iter == 0) or (j == recursion_depth - 1): + print( + "Recursion at depth %s: norm is %.8lf" % (j, np.linalg.norm(self.list_to_vec((cur_estimate))))) + feed_dict = self.update_feed_dict_with_v_placeholder(feed_dict, cur_estimate) + + if inverse_hvp is None: + inverse_hvp = [b / scale for b in cur_estimate] + else: + inverse_hvp = [a + b / scale for (a, b) in zip(inverse_hvp, cur_estimate)] + + inverse_hvp = [a / num_samples for a in inverse_hvp] + return inverse_hvp + + def minibatch_hessian_vector_val(self, v): + + num_examples = self.num_train_examples + if self.mini_batch == True: + batch_size = 100 + assert num_examples % batch_size == 0 + else: + batch_size = self.num_train_examples + + num_iter = int(num_examples / batch_size) + + self.reset_datasets() + hessian_vector_val = None + for i in range(num_iter): + feed_dict = self.fill_feed_dict_with_batch(self.data_sets.train, batch_size=batch_size) + # Can optimize this + feed_dict = self.update_feed_dict_with_v_placeholder(feed_dict, [v]) + + hessian_vector_val_temp = self.sess.run(self.hessian_vector, feed_dict=feed_dict) + if hessian_vector_val is None: + hessian_vector_val = [b / float(num_iter) for b in hessian_vector_val_temp] + else: + hessian_vector_val = [a + (b / float(num_iter)) for (a, b) in + zip(hessian_vector_val, hessian_vector_val_temp)] + + 
hessian_vector_val = [a + self.damping * b for (a, b) in zip(hessian_vector_val, v)] + + return hessian_vector_val + + # def minibatch_hessian_vector_val(self, v): + # # self.reset_datasets() + # feed_dict = self.fill_feed_dict_with_batch(self.data_sets.train, batch_size=self.num_train_examples) + # # Can optimize this + # feed_dict = self.update_feed_dict_with_v_placeholder(feed_dict, [v]) + # # print(feed_dict) + # hessian_vector_val = self.sess.run(self.hessian_vector, feed_dict=feed_dict) + # # hessian_vector_val = self.sess.run(tf.add(hessian_vector_val[0], v[0] * self.damping)) + # # hessian_vector_val = lambda x, y: tf.add() elems=(hessian_vector_val[0], v[0]) + # hessian_vector_val = [a + self.damping * b for (a, b) in zip(hessian_vector_val, v)] + # return hessian_vector_val + + def list_to_vec(self, l): + if not isinstance(l, list) and not isinstance(l, tuple): + return l.reshape(-1) + # original = [array([[8.8823157e-08], [4.6933826e-07]], dtype=float32)] + # after reshape -> [array([8.8823157e-08, 4.6933826e-07], dtype=float32)] + # after hstakc -> array([8.8823157e-08, 4.6933826e-07], dtype=float32) + return np.hstack([np.reshape(a, (-1)) for a in l]) + # if not isinstance(l, list) and not isinstance(l, tuple): + # return tf.reshape(l, -1) + # return tf.concat([tf.reshape(a, (-1)) for a in l], axis=1) + + def get_fmin_loss_fn(self, v): + def get_fmin_loss(x): + hessian_vector_val = self.list_to_vec(self.minibatch_hessian_vector_val(self.vec_to_list(x))) + return 0.5 * np.dot(hessian_vector_val, x) - np.dot(self.list_to_vec(v), x) + return get_fmin_loss + + def get_fmin_grad_fn(self, v): + def get_fmin_grad(x): + hessian_vector_val = self.minibatch_hessian_vector_val(self.vec_to_list(x)) + return self.list_to_vec(hessian_vector_val) - self.list_to_vec(v) + return get_fmin_grad + + def get_fmin_hvp(self, x, p): + hessian_vector_val = self.minibatch_hessian_vector_val(self.vec_to_list(p)) + return self.list_to_vec(hessian_vector_val) + + # TODO: still talking about remove a single step, need to be updated to be meaningful + def get_cg_callback(self, v, verbose): + fmin_loss_fn = self.get_fmin_loss_fn(v) + + def fmin_loss_split(x): + hessian_vector_val = self.list_to_vec(self.minibatch_hessian_vector_val(self.vec_to_list(x))) + return 0.5 * np.dot(hessian_vector_val, x), -np.dot(self.list_to_vec(v), x) + + def cg_callback(x): + # x is current params + v = self.vec_to_list(x) + idx_to_remove = 5 + single_train_feed_dict = self.fill_feed_dict_with_one_ex(self.data_sets.train, idx_to_remove) + train_grad_loss_val = self.sess.run(self.grad_total_loss_op, feed_dict=single_train_feed_dict) + predicted_loss_diff = np.dot(self.list_to_vec(v), + self.list_to_vec(train_grad_loss_val)) / self.num_train_examples + if verbose: + print('Function value: %s' % fmin_loss_fn(x)) + quad, lin = fmin_loss_split(x) + print('Split function value: %s, %s' % (quad, lin)) + print('Predicted loss diff on train_idx %s: %s' % (idx_to_remove, predicted_loss_diff)) + + return cg_callback + + def get_inverse_hvp_cg(self, v, verbose): + + fmin_loss_fn = self.get_fmin_loss_fn(v) + fmin_grad_fn = self.get_fmin_grad_fn(v) + cg_callback = self.get_cg_callback(v, verbose) + + fmin_results = fmin_ncg( + f=fmin_loss_fn, + x0=np.concatenate([np.reshape(x, [-1, 1]) for x in v]), + fprime=fmin_grad_fn, + fhess_p=self.get_fmin_hvp, + callback=cg_callback, + avextol=1e-8, + maxiter=100000, + disp=verbose + ) + # x0 = np.concatenate([np.reshape(x, [-1, 1]) for x in v]) + # feed_dict = 
self.fill_feed_dict_with_all_ex(self.data_sets.train) + # for pl_block, vec_block in zip(self.v_placeholder, [x0]): + # feed_dict[pl_block] = vec_block + # # return self.vec_to_list(fmin_results) + # print('hessian_vector_val={} from hvp_cg'.format(self.sess.run(self.hessian_vector, feed_dict=feed_dict))) + return fmin_results + + def get_inverse_hvp_cg_new(self, v, verbose): + # self.reset_datasets() + feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.train) + for pl_block, vec_block in zip(self.v_placeholder, v): + feed_dict[pl_block] = vec_block + for ii in range(100): + self.sess.run(self.training_process, feed_dict=feed_dict) + return self.sess.run(self.t / 2) + + def get_gradient(self, gradient_op, data_set, indices=None, batch_size=100): + if indices is None: + indices = range(len(data_set.labels)) + + num_iter = int(np.ceil(len(indices) / batch_size)) + + grad_loss_no_reg_val = None + for i in range(num_iter): + start = i * batch_size + end = int(min((i + 1) * batch_size, len(indices))) + + test_feed_dict = self.fill_feed_dict_with_some_ex(data_set, indices[start:end]) + temp = self.sess.run(operate_deep(tf.convert_to_tensor, + gradient_op), feed_dict=test_feed_dict) + + if grad_loss_no_reg_val is None: + grad_loss_no_reg_val = operate_deep(lambda a: a * (end - start), temp) + else: + grad_loss_no_reg_val = \ + [operate_deep_2v(lambda a, b: a + b * (end - start), x, y) + for (x, y) in zip(grad_loss_no_reg_val, temp)] + + grad_loss_no_reg_val = operate_deep(lambda a: a / len(indices), grad_loss_no_reg_val) + + return grad_loss_no_reg_val + + def get_hessian(self, data_set, indices=None, batch_size=100): + if indices is None: + indices = range(len(data_set.labels)) + + num_iter = int(np.ceil(len(indices) / batch_size)) + + grad_loss_no_reg_val = None + for i in range(num_iter): + start = i * batch_size + end = int(min((i + 1) * batch_size, len(indices))) + + test_feed_dict = self.fill_feed_dict_with_some_ex(data_set, indices[start:end]) + temp = self.sess.run(self.hessian, feed_dict=test_feed_dict) + + if grad_loss_no_reg_val is None: + grad_loss_no_reg_val = operate_deep(lambda a: a * (end - start), temp) + else: + grad_loss_no_reg_val = \ + [operate_deep_2v(lambda a, b: a + b * (end - start), x, y) + for (x, y) in zip(grad_loss_no_reg_val, temp)] + + grad_loss_no_reg_val = operate_deep(lambda a: a / len(indices), grad_loss_no_reg_val) + + return grad_loss_no_reg_val + + def get_ich(self, y, data_set, approx_type='cg', approx_params=None, verbose=True): + # aaa = self.hessian_inverse_vector_product(data_set) + iter = 300 + init = tf.global_variables_initializer() + self.sess.run(init) + data_set = copy.copy(data_set) + data_set.reset_batch() + data_set.reference_ts = y + psi_y = self.sess.run(self.grad_total_loss_op, feed_dict={self.ts_placeholder: y, + self.input_index_placeholder: data_set.x, + self.labels_index_placeholder: data_set.labels}) + feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.train) + for pl_block, vec_block in zip(self.v_placeholder, psi_y): + feed_dict[pl_block] = vec_block + for jj in range(iter): + self.sess.run(self.training_process, feed_dict=feed_dict) + ich = self.sess.run(self.t) + + # validations + # if verbose: + # print("To validate the inverse hvp, the following to list should be very close: ") + # print(psi_y) + # print(self.minibatch_hessian_vector_val(ich)) + + return ich + + def __loop_over_gamma(self, series, y_contaminate, gamma, seed, fun): + n_time_stamp = self.n_time_stamp + n_non_nan = self.n_non_nan_y_cont + series = 
series.copy() + np.random.seed(seed) + condition = gamma * n_time_stamp / n_non_nan + if condition > 1: + a = np.empty(series.shape) + a.fill(np.nan) + res = fun(a) + return np.insert(res, 0, gamma) + idx = np.random.binomial(size=n_time_stamp, n=1, p=condition).astype(bool) + idx = np.logical_and(idx, self.non_nan_y_cont_idx) + if sum(idx) > 0: + series[idx, :] = y_contaminate[idx, np.newaxis] + # return series, gamma + res = fun(series) + # print(gamma, seed) + return np.insert(res, 0, gamma) + + # gamma=0 + def __loop_over_gamma_0(self, series, y_contaminate, gamma, seed, fun): + n_time_stamp = self.n_time_stamp + series = series.copy() + np.random.seed(seed) + idx = np.random.binomial(size=n_time_stamp, n=1, p=0).astype(bool) + idx = np.logical_and(idx, self.non_nan_y_cont_idx) + if sum(idx) > 0: + series[idx, :] = y_contaminate[idx, np.newaxis] + res = fun(series) + # replace res = fun(series) with the original ich function + # data_set = copy.copy(self.data_sets.train) + # data_set.reset_batch() + # data_set.reference_ts = series + # psi_y = self.sess.run(self.grad_total_loss_op, feed_dict= + # {self.ts_placeholder: series, + # self.input_index_placeholder: data_set.x, + # self.labels_index_placeholder: data_set.labels}) + # feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.train) + # for pl_block, vec_block in zip(self.v_placeholder, psi_y): + # feed_dict[pl_block] = vec_block + # iters = 300 + # for ii in range(iters): + # self.sess.run(self.training_process, feed_dict=feed_dict) + # res = self.sess.run(self.t) + # validations + # if True: + # print("gmma0: To validate the inverse hvp, the following to list should be very close: ") + # print(psi_y) + # print(self.minibatch_hessian_vector_val(res)) + return np.insert(res, 0, gamma) + + def __expection_over_gamma(self, fun, y_contaminate, gammas, expectation_rep_time, verbose): + # for gamma = 0, no need to repeat + if 0 in gammas: + fun_v_0 = self.__loop_over_gamma_0(self.time_series, y_contaminate, 0, 0, fun) + fun_v_0 = [fun_v_0] * expectation_rep_time + else: + fun_v_0 = [] + large_gamma = self.large_gamma + rest_gamma = self.rest_gamma + if len(large_gamma) > 0: + series = self.time_series.copy() + idx = self.non_nan_y_cont_idx + series[idx, :] = y_contaminate[idx, np.newaxis] + res = fun(series) + for gamma in large_gamma: + fun_v_0 += [np.insert(res, 0, gamma)] * expectation_rep_time + fun_v = [] + # start_time = time.time() + # series_gamma_pair = [] + for gamma in rest_gamma: + for i in range(expectation_rep_time): + fun_v.append(self.__loop_over_gamma(self.time_series, y_contaminate, gamma, i, fun)) + # series_gamma_pair.append((a, b)) + # fun_v = fun(series_gamma_pair) + # end_time = time.time() + # print('{} s to compute nested for loop'.format(end_time - start_time)) + df = pd.DataFrame(fun_v_0 + fun_v) + df_expect_value = df.groupby(df.columns[0]).agg(np.mean) + slopes = [] + for i in range(df_expect_value.shape[1]): + slope, _, r_value, _, _ = linregress(df_expect_value.index.values, + df_expect_value.iloc[:, i].values) + if verbose: + print("Regression R-squared: %f" % r_value ** 2) + elif r_value ** 2 < 0.3: + print("Regression R-squared: %f" % r_value ** 2) + slopes.append(slope) + return np.array(slopes) + + def get_if(self, y_contaminate, gammas=None, expectation_rep_time=10, verbose=True): + return self.__expection_over_gamma( + lambda s: self.get_ich(s, self.data_sets.train, verbose=verbose), + y_contaminate, gammas, expectation_rep_time, verbose) + + def partial_m_partial_gamma(self, index, short_memo, 
patchy_k, y_contaminate): # lemma 3.3 + def get_diff(series, index, act_pred): + return self.sess.run(self.preds, \ + self.fill_feed_dict_with_one_ex(self.data_sets.test, index, + series))[0, 0] - act_pred[0, 0] + + pred_gamma = 0.0 + act_pred = self.sess.run(self.preds, \ + self.fill_feed_dict_with_one_ex(self.data_sets.test, index)) + if short_memo: + zero_cnt = 0 + max_train = self.data_sets.test.labels[index][0] + for m in range(max_train - 1, -1, -1): + series = self.time_series.copy() + idx = np.arange(m, np.min([m + patchy_k, max_train])) + + replace_by = y_contaminate[idx, np.newaxis] + nan_idx = np.isnan(replace_by) + if nan_idx.sum() > 0: # random sample and get the average + dif = [] + for i in range(200): + replace_by[nan_idx] = \ + np.random.choice(y_contaminate[np.logical_not(np.isnan(y_contaminate))], nan_idx.sum()) + series[idx] = replace_by + dif.append(get_diff(series, index, act_pred)) + dif = np.mean(dif) + else: + series[idx] = replace_by + dif = get_diff(series, index, act_pred) + + if short_memo: # not long memory and should be OK to assume far away points has no impact on current values + if abs(dif) > 1e-5: + zero_cnt = 0 + else: + zero_cnt += 1 + if zero_cnt > 10: # has no contribution for 10 timestamps + break + + pred_gamma += dif + return pred_gamma / patchy_k + + def get_sif(self, y_contaminate, index, patchy_k, + gammas=None, expectation_rep_time=10, + verbose=True, is_ar=False, short_memo=True): + ''' + y_contaminated: contaminating process + ''' + if gammas is None: + gammas = np.arange(0.0, 0.09, 0.01) + + # IF value + init = tf.global_variables_initializer() + self.sess.run(init) + start_time = time.time() + if_v = self.get_if(y_contaminate, gammas, expectation_rep_time, verbose) + end_time = time.time() + print('{} s to compute if_v'.format(end_time - start_time)) + start_time = time.time() + # Pred Over Gamma + if patchy_k is not None: # lemma 3.3 + pred_gamma = self.partial_m_partial_gamma(index, short_memo, patchy_k, y_contaminate) + end_time = time.time() + print('{} s to compute patchy pred gamma'.format(end_time - start_time)) + elif is_ar: # lemma 3.4 + pred_gamma = self.sess.run( + (-tf.reduce_mean(self.labels_placeholder) + np.nanmean(y_contaminate)) * tf.reduce_sum(self.params), + self.fill_feed_dict_with_all_ex(self.data_sets.train)) + end_time = time.time() + print('{} s to compute ar pred gamma'.format(end_time - start_time)) + else: + pred_gamma = self.__expection_over_gamma( + lambda s: self.sess.run(self.preds, + self.fill_feed_dict_with_one_ex(self.data_sets.test, index, s)), + y_contaminate, + gammas, expectation_rep_time, + verbose) + pred_gamma = pred_gamma[0] + end_time = time.time() + print('{} s to compute third type pred gamma'.format(end_time - start_time)) + start_time = time.time() + # + # psi_y = self.sess.run(self.grad_total_loss_op, feed_dict= + # {self.ts_placeholder: y, + # self.input_index_placeholder: data_set.x, + # self.labels_index_placeholder: data_set.labels}) + # Pred Over Theta + psi_y = self.sess.run(tf.gradients(self.preds, self.params), + self.fill_feed_dict_with_one_ex(self.data_sets.test, index)) + end_time = time.time() + print('{} s to compute psi_y'.format(end_time - start_time)) + res = pred_gamma + np.dot(self.list_to_vec(psi_y), if_v) + return res + + def find_eigvals_of_hessian(self, num_iter=100, num_prints=10): + + # Setup + print_iterations = num_iter / num_prints + feed_dict = self.fill_feed_dict_with_one_ex(self.data_sets.train, 0) + + # Initialize starting vector + grad_loss_val = 
self.sess.run(self.grad_total_loss_op, feed_dict=feed_dict) + initial_v = [] + + for a in grad_loss_val: + initial_v.append(np.random.random(a.shape)) + initial_v, norm_val = normalize_vector(initial_v) + + # Do power iteration to find largest eigenvalue + print('Starting power iteration to find largest eigenvalue...') + + largest_eig = norm_val + print('Largest eigenvalue is %s' % largest_eig) + + # Do power iteration to find smallest eigenvalue + print('Starting power iteration to find smallest eigenvalue...') + cur_estimate = initial_v + dotp = -1.0 + for i in range(num_iter): + cur_estimate, norm_val = normalize_vector(cur_estimate) + hessian_vector_val = self.minibatch_hessian_vector_val(cur_estimate) + new_cur_estimate = [a - largest_eig * b for (a, b) in zip(hessian_vector_val, cur_estimate)] + + if i % print_iterations == 0: + print(-norm_val + largest_eig) + dotp = np.dot(np.concatenate(new_cur_estimate), np.concatenate(cur_estimate)) + print("dot: %s" % dotp) + cur_estimate = new_cur_estimate + + smallest_eig = -norm_val + largest_eig + assert dotp < 0, "Eigenvalue calc failed to find largest eigenvalue" + + print('Largest eigenvalue is %s' % largest_eig) + print('Smallest eigenvalue is %s' % smallest_eig) + return largest_eig, smallest_eig + + def update_train_x(self, new_train_x): + assert np.all(new_train_x.shape == self.data_sets.train.x.shape) + new_train = DataSet(new_train_x, np.copy(self.data_sets.train.labels)) + self.data_sets = self.Datasets(train=new_train, test=self.data_sets.test) + self.all_train_feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.train) + self.reset_datasets() + + def update_train_x_y(self, new_train_x, new_train_y): + new_train = DataSet(new_train_x, new_train_y) + self.data_sets = self.Datasets(train=new_train, test=self.data_sets.test) + self.all_train_feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.train) + self.num_train_examples = len(new_train_y) + self.reset_datasets() + + def update_test_x_y(self, new_test_x, new_test_y): + new_test = DataSet(new_test_x, new_test_y) + self.data_sets = self.Datasets(train=self.data_sets.train, test=new_test) + self.all_test_feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.test) + self.num_test_examples = len(new_test_y) + self.reset_datasets() + + @abstractmethod + def get_all_params(self): + pass + + @abstractmethod + def predictions(self, logit): + pass + + @abstractmethod + def placeholder_inputs(self): + pass + + @abstractmethod + def inference(self, input_x, labels_placeholder=None, keep_probs_placeholder=None): + pass + diff --git a/aix360/algorithms/sif/SIF_NN.py b/aix360/algorithms/sif/SIF_NN.py new file mode 100644 index 0000000..bb5b2a6 --- /dev/null +++ b/aix360/algorithms/sif/SIF_NN.py @@ -0,0 +1,201 @@ +from __future__ import division +from __future__ import print_function +from __future__ import absolute_import +from __future__ import unicode_literals +import tensorflow as tf +from aix360.algorithms.sif.SIF import SIFExplainer +from aix360.datasets.SIF_dataset import DataSet + + +class AllAR(SIFExplainer): + def __init__(self, x_dim, y_dim, time_steps, share_param, **kwargs): + self.time_steps = time_steps + self.x_dim = x_dim + self.cells = None + self.y_dim = y_dim + self.share_param = share_param + if share_param: + self.out_weights = tf.Variable(tf.random_normal([self.time_steps, 1])) + else: + self.out_weights = tf.Variable(tf.random_normal([self.time_steps, self.y_dim])) + super().__init__(**kwargs) + + def get_all_params(self): + all_params = [] + 
all_params.append(self.out_weights) + return all_params + + def retrain(self, num_steps, feed_dict): + retrain_dataset = DataSet(feed_dict[self.input_index_placeholder], feed_dict[self.labels_index_placeholder]) + for step in range(num_steps): + iter_feed_dict = self.fill_feed_dict_with_batch(retrain_dataset) + self.sess.run(self.train_op, feed_dict=iter_feed_dict) + + def placeholder_inputs(self): + input_index_placeholder = tuple([tf.placeholder( + tf.int32, + shape=(None, 1), + name='input_index_placeholder_{}'.format(i)) for i in range(self.time_steps)]) + labels_index_placeholder = tf.placeholder( + tf.int32, + shape=(None, 1), + name='labels_index_placeholder') + ts_placeholder = tf.placeholder( + tf.float32, + shape=[None, self.y_dim], + name='input_ts') + return input_index_placeholder, labels_index_placeholder, ts_placeholder + + def inference(self, input_x, labels_placeholder=None, keep_probs_placeholder=None): + if self.share_param: + weight = tf.tile(self.out_weights, [1, self.y_dim], name='Weight') + else: + weight = self.out_weights + x = tf.stack([x[:, 0] for x in input_x], axis=1, name='x') + y_hat = tf.einsum('ijk,jk->ik', x, weight, name='y_hat') + return y_hat + + def predictions(self, logits): + preds = logits + return preds + + +class AllLSTM(SIFExplainer): + def __init__(self, x_dim, y_dim, time_steps, num_units, share_param, **kwargs): + self.time_steps = time_steps + self.x_dim = x_dim + self.num_units = num_units + self.cells = None + self.y_dim = y_dim + self.share_param = share_param + if share_param: + self.out_weights = tf.Variable(tf.random_normal([self.num_units, 1])) + self.out_bias = tf.Variable(tf.random_normal([1, 1])) + else: + self.out_weights = tf.Variable(tf.random_normal([self.num_units, self.y_dim])) + self.out_bias = tf.Variable(tf.random_normal([1, self.y_dim])) + super().__init__(**kwargs) + + def get_all_params(self): + all_params = [] + lstm_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="LSTM") + all_params += lstm_variables + all_params.append(self.out_weights) + all_params.append(self.out_bias) + return all_params + + def retrain(self, num_steps, feed_dict): + retrain_dataset = DataSet(feed_dict[self.input_index_placeholder], feed_dict[self.labels_index_placeholder]) + for step in range(num_steps): + iter_feed_dict = self.fill_feed_dict_with_batch(retrain_dataset) + self.sess.run(self.train_op, feed_dict=iter_feed_dict) + + + def placeholder_inputs(self): + input_index_placeholder = tuple([tf.placeholder( + tf.int32, + shape=(None, 1), + name='input_index_placeholder_{}'.format(i)) for i in range(self.time_steps)]) + labels_index_placeholder = tf.placeholder( + tf.int32, + shape=(None, 1), + name='labels_index_placeholder') + ts_placeholder = tf.placeholder( + tf.float32, + shape=[None, self.y_dim], + name='input_ts') + return input_index_placeholder, labels_index_placeholder, ts_placeholder + + def inference(self, input_x, labels_placeholder=None, keep_probs_placeholder=None): + if isinstance(input_x, list) | isinstance(input_x, tuple): + n = input_x[0].shape[2] + x = [tuple(x0[:, :, i] for x0 in input_x) for i in range(n)] + else: + n = input_x.shape[2] + x = [input_x[:, :, i] for i in range(n)] + with tf.variable_scope("LSTM") as vs: + cell = tf.nn.rnn_cell.BasicLSTMCell(self.num_units, name='LSTM_Layer') + def run_lstm(x_n): + output, _ = tf.nn.static_rnn(cell, x_n, dtype=tf.float32) + return tf.matmul(output[-1], self.out_weights) + self.out_bias + res = tf.stack(list(map(run_lstm, x)), axis=1)[:, :, 0] + return res 
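+    # The einsum in AllAR.inference above computes one AR(p) combination per output
+    # dimension: with x of shape (batch, time_steps, y_dim) and weights w of shape
+    # (time_steps, y_dim), y_hat[i, k] = sum_j x[i, j, k] * w[j, k]. A NumPy sketch
+    # with made-up shapes (illustrative only):
+    #
+    #     import numpy as np
+    #     batch, time_steps, y_dim = 4, 3, 2
+    #     x = np.random.randn(batch, time_steps, y_dim).astype(np.float32)
+    #     w = np.random.randn(time_steps, y_dim).astype(np.float32)
+    #     y_hat = np.einsum('ijk,jk->ik', x, w)                   # shape (batch, y_dim)
+    #     assert np.allclose(y_hat[:, 0], x[:, :, 0] @ w[:, 0])   # per-dimension AR fit
+    #
+    # AllLSTM.inference above applies the same per-dimension idea, but runs each
+    # series dimension through a shared BasicLSTMCell followed by a linear readout
+    # instead of a fixed lag-weight vector.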
+ + def predictions(self, logits): + preds = logits + return preds + + +class AllRNN(SIFExplainer): + def __init__(self, x_dim, y_dim, time_steps, num_units, share_param, **kwargs): + self.time_steps = time_steps + self.x_dim = x_dim + self.num_units = num_units + self.cells = None + self.y_dim = y_dim + self.share_param = share_param + if share_param: + self.out_weights = tf.Variable(tf.random_normal([self.num_units, 1])) + self.out_bias = tf.Variable(tf.random_normal([1, 1])) + else: + self.out_weights = tf.Variable(tf.random_normal([self.num_units, self.y_dim])) + self.out_bias = tf.Variable(tf.random_normal([1, self.y_dim])) + super().__init__(**kwargs) + + def get_all_params(self): + all_params = [] + rnn_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="RNN") + all_params += rnn_variables + all_params.append(self.out_weights) + all_params.append(self.out_bias) + return all_params + + def retrain(self, num_steps, feed_dict): + retrain_dataset = DataSet(feed_dict[self.input_index_placeholder], feed_dict[self.labels_index_placeholder]) + for step in range(num_steps): + iter_feed_dict = self.fill_feed_dict_with_batch(retrain_dataset) + self.sess.run(self.train_op, feed_dict=iter_feed_dict) + + def placeholder_inputs(self): + input_index_placeholder = tuple([tf.placeholder( + tf.int32, + shape=(None, 1), + name='input_index_placeholder_{}'.format(i)) for i in range(self.time_steps)]) + labels_index_placeholder = tf.placeholder( + tf.int32, + shape=(None, 1), + name='labels_index_placeholder') + ts_placeholder = tf.placeholder( + tf.float32, + shape=[None, self.y_dim], + name='input_ts') + return input_index_placeholder, labels_index_placeholder, ts_placeholder + + def inference(self, input_x, labels_placeholder=None, keep_probs_placeholder=None): + from tensorflow.keras import layers + model = tf.keras.Sequential() + model.add(layers.Embedding(input_dim=1000, output_dim=64)) + # The output of GRU will be a 3D tensor of shape (batch_size, timesteps, 256) + model.add(layers.GRU(256, return_sequences=True)) + # The output of SimpleRNN will be a 2D tensor of shape (batch_size, 128) + model.add(layers.SimpleRNN(128)) + model.add(layers.Dense(10, activation='softmax')) + model.summary() + if isinstance(input_x, list) | isinstance(input_x, tuple): + n = input_x[0].shape[2] + x = [tuple(x0[:, :, i] for x0 in input_x) for i in range(n)] + else: + n = input_x.shape[2] + x = [input_x[:, :, i] for i in range(n)] + with tf.variable_scope("RNN") as vs: + cell = tf.nn.rnn_cell.BasicLSTMCell(self.num_units, name='RNN_Layer') + def run_rnn(x_n): + output, _ = tf.nn.static_rnn(cell, x_n, dtype=tf.float32) + return tf.matmul(output[-1], self.out_weights) + self.out_bias + + res = tf.stack(list(map(run_rnn, x)), axis=1)[:, :, 0] + return res + + def predictions(self, logits): + preds = logits + return preds diff --git a/aix360/algorithms/sif/SIF_utils.py b/aix360/algorithms/sif/SIF_utils.py new file mode 100644 index 0000000..b52e10a --- /dev/null +++ b/aix360/algorithms/sif/SIF_utils.py @@ -0,0 +1,331 @@ +### Adapted from TF repo + +from tensorflow import gradients +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +import tensorflow as tf +import numpy as np +from tensorflow.contrib.rnn import RNNCell, LSTMCell +import statsmodels.tsa.api as smt + + +def get_contaminate_series(core_series, y_contaminate, train_label, gamma=0.1): + ''' + gamma: the percentage of the contaminated sample insert into the 
orginal time sequence. + ''' + series = core_series.copy() + n_timestamp = train_label.shape[0] + idx = np.random.binomial(size=n_timestamp, n=1, p=gamma).astype(bool) + idx = np.where(idx) + num_to_chg = len(idx) + if num_to_chg > 0: + series[idx, :] = y_contaminate[idx, np.newaxis] + return series + + +def sample_Z(batch_size, seq_length, latent_dim): + sample = np.float32(np.random.normal(size=[batch_size, seq_length, latent_dim])) + return sample + + +def generator_lstm(z, hidden_units_g, seq_length, batch_size, num_signals, reuse=False, parameters=None): + """ + If parameters are supplied, initialise as such + """ + with tf.variable_scope("generator") as scope: + if reuse: + scope.reuse_variables() + if parameters is None: + W_out_G_initializer = tf.truncated_normal_initializer() + b_out_G_initializer = tf.truncated_normal_initializer() + lstm_initializer = None + + else: + W_out_G_initializer = tf.constant_initializer(value=parameters['generator/W_out_G:0']) + b_out_G_initializer = tf.constant_initializer(value=parameters['generator/b_out_G:0']) + + lstm_initializer = tf.constant_initializer(value=parameters['generator/rnn/lstm_cell/weights:0']) + bias_start = parameters['generator/rnn/lstm_cell/biases:0'] + + W_out_G = tf.get_variable(name='W_out_G', shape=[hidden_units_g, num_signals], + initializer=W_out_G_initializer) + b_out_G = tf.get_variable(name='b_out_G', shape=num_signals, initializer=b_out_G_initializer) + + # inputs + inputs = z + + cell = LSTMCell(num_units=hidden_units_g, + state_is_tuple=True, + initializer=lstm_initializer, + reuse=reuse) + rnn_outputs, rnn_states = tf.nn.dynamic_rnn( + cell=cell, + dtype=tf.float32, + sequence_length=[seq_length] * batch_size, + inputs=inputs) + rnn_outputs = rnn_outputs[:, -1, :] + output = tf.matmul(rnn_outputs, W_out_G) + b_out_G # output weighted sum + + m = tf.reduce_mean(output, axis=0) + std = tf.sqrt(tf.reduce_mean(tf.square(output-m))) + return (output - m)/std + + +def generator_arma(params, p, q, nsample): + """ + If parameters are supplied, initialise as such + """ + assert len(params) == p+q, "The length of the parameters must equals p+q" + ar = np.r_[1, -params[:p]] + ma = np.r_[1, params[p:]] + output = smt.arma_generate_sample(ar, ma, nsample, burnin=200) + m = np.mean(output) + std = np.std(output) + if std < 1e-9: + std = 1 + return (output - m) / std + + +def hessian_vector_product(ys, xs, v): + """Multiply the Hessian of `ys` wrt `xs` by `v`. + This is an efficient construction that uses a backprop-like approach + to compute the product between the Hessian and another vector. The + Hessian is usually too large to be explicitly computed or even + represented, but this method allows us to at least multiply by it + for the same big-O cost as backprop. + Implicit Hessian-vector products are the main practical, scalable way + of using second derivatives with neural networks. They allow us to + do things like construct Krylov subspaces and approximate conjugate + gradient descent. + Example: if `y` = 1/2 `x`^T A `x`, then `hessian_vector_product(y, + x, v)` will return an expression that evaluates to the same values + as (A + A.T) `v`. + Args: + ys: A scalar value, or a tensor or list of tensors to be summed to + yield a scalar. + xs: A list of tensors that we should construct the Hessian over. + v: A list of tensors, with the same shapes as xs, that we want to + multiply by the Hessian. 
+ Returns: + A list of tensors (or if the list would be length 1, a single tensor) + containing the product between the Hessian and `v`. + Raises: + ValueError: `xs` and `v` have different length. + """ + + + # Validate the input + length = len(xs) + if len(v) != length: + raise ValueError("xs and v must have the same length.") + + # First backprop + grads = gradients(ys, xs) + + # grads = xs + + assert len(grads) == length + + elemwise_products = [math_ops.multiply(grad_elem, array_ops.stop_gradient(v_elem)) + for grad_elem, v_elem in zip(grads, v) if grad_elem is not None + ] + + # Second backprop + grads_with_none = gradients(elemwise_products, xs) + return_grads = [ + grad_elem if grad_elem is not None \ + else tf.zeros_like(x) \ + for x, grad_elem in zip(xs, grads_with_none)] + + return return_grads + + +def _AsList(x): + return x if isinstance(x, (list, tuple)) else [x] + + +def gradient_reshape(ys, x, **kwargs): + # Compute the partial derivatives of the input with respect to all + # elements of `x` + _gradients = tf.gradients(ys, x, **kwargs)[0] + # for higher dimension, let's conver then into a vector + _gradients = tf.reshape(_gradients, [-1]) + return _gradients + + +def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False, + gate_gradients=False, aggregation_method=None): + """Constructs the Hessian of sum of `ys` with respect to `x` in `xs`. + `hessians()` adds ops to the graph to output the Hessian matrix of `ys` + with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` + where each tensor is the Hessian of `sum(ys)`. This function currently + only supports evaluating the Hessian with respect to (a list of) one- + dimensional tensors. + The Hessian is a matrix of second-order partial derivatives of a scalar + tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details). + Args: + ys: A `Tensor` or list of tensors to be differentiated. + xs: A `Tensor` or list of tensors to be used for differentiation. + name: Optional name to use for grouping all the gradient ops together. + defaults to 'hessians'. + colocate_gradients_with_ops: See `gradients()` documentation for details. + gate_gradients: See `gradients()` documentation for details. + aggregation_method: See `gradients()` documentation for details. + Returns: + A list of Hessian matrices of `sum(y)` for each `x` in `xs`. + Raises: + LookupError: if one of the operations between `xs` and `ys` does not + have a registered gradient function. + ValueError: if the arguments are invalid or not supported. Currently, + this function only supports one-dimensional `x` in `xs`. + """ + # import time + # start_time = time.time() + + xs = _AsList(xs) + kwargs = { + 'colocate_gradients_with_ops': colocate_gradients_with_ops, + 'gate_gradients': gate_gradients, + 'aggregation_method': aggregation_method + } + # Compute a hessian matrix for each x in xs + hessians = [] + for x in xs: + # Check dimensions + ndims = x.get_shape().ndims + if ndims is None: + raise ValueError('Cannot compute Hessian because the dimensionality of ' + 'element number %d of `xs` cannot be determined' % i) + #elif ndims != 1: + # raise ValueError('Computing hessians is currently only supported for ' + # 'one-dimensional tensors. Element number %d of `xs` has ' + # '%d dimensions.' 
% (i, ndims))
+        with ops.name_scope(name + '_first_derivative'):
+            _gradients = gradient_reshape(ys, x, **kwargs)
+
+        # Unpack the gradients into a list so we can take derivatives with
+        # respect to each element
+        _gradients = tf.unstack(_gradients, _gradients.get_shape()[0].value)
+        row = []
+        with ops.name_scope(name + '_second_derivative'):
+            for xp in xs:
+                # Compute the partial derivatives with respect to each element of the list
+                _hess = [gradient_reshape(_gradient, xp, **kwargs) for _gradient in _gradients]
+                #_hess = tf.gradients(_gradients, x, **kwargs)[0]
+                # Pack the list into a matrix and add it to the list of hessians
+                row.append(tf.stack(_hess, name=name))
+        hessians.append(row)
+    return hessians
+
+
+def derivative_x1_x2(ys, xs1, xs2, name="Second_order_derivative",
+                     colocate_gradients_with_ops=False,
+                     gate_gradients=False, aggregation_method=None):
+    xs1 = _AsList(xs1)
+    xs2 = _AsList(xs2)
+    kwargs = {
+        'colocate_gradients_with_ops': colocate_gradients_with_ops,
+        'gate_gradients': gate_gradients,
+        'aggregation_method': aggregation_method
+    }
+    # Compute the matrix of second-order cross derivatives for each pair (x1, x2)
+    derivatives = []
+    for i, x in enumerate(xs1):
+        # Check dimensions
+        ndims = x.get_shape().ndims
+        if ndims is None:
+            raise ValueError('Cannot compute the derivative because the dimensionality of '
+                             'element number %d of `xs1` cannot be determined' % i)
+        #elif ndims != 1:
+        #    raise ValueError('Computing hessians is currently only supported for '
+        #                     'one-dimensional tensors. Element number %d of `xs` has '
+        #                     '%d dimensions.' % (i, ndims))
+        with ops.name_scope(name + '_first_derivative'):
+            _gradients = gradient_reshape(ys, x, **kwargs)
+
+        # Unpack the gradients into a list so we can take derivatives with
+        # respect to each element
+        _gradients = tf.unstack(_gradients, _gradients.get_shape()[0].value)
+
+        _derivative = []
+        for x2 in xs2:
+            with ops.name_scope(name + '_second_derivative'):
+                # Compute the partial derivatives with respect to each element of the list
+                _hess = [gradient_reshape(_gradient, x2, **kwargs) for _gradient in _gradients]
+                #_hess = tf.gradients(_gradients, x, **kwargs)[0]
+                # Pack the list into a matrix and add it to the list of derivatives
+                _derivative.append(tf.stack(_hess, name=name))
+        derivatives.append(_derivative)
+
+    return derivatives
+
+
+def variable(name, shape, initializer):
+    dtype = tf.float32
+    var = tf.get_variable(
+        name,
+        shape,
+        initializer=initializer,
+        dtype=dtype)
+    return var
+
+
+def variable_with_weight_decay(name, shape, stddev, wd):
+    """Helper to create an initialized Variable with weight decay.
+    Note that the Variable is initialized with a truncated normal distribution.
+    A weight decay is added only if one is specified.
+    Args:
+      name: name of the variable
+      shape: list of ints
+      stddev: standard deviation of a truncated Gaussian
+      wd: add L2Loss weight decay multiplied by this float. If None, weight
+        decay is not added for this Variable.
+ Returns: + Variable Tensor + """ + dtype = tf.float32 + var = variable( + name, + shape, + initializer=tf.truncated_normal_initializer( + stddev=stddev, + dtype=dtype)) + + if wd is not None: + weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss') + tf.add_to_collection('losses', weight_decay) + + return var + + +def normalize_vector(v): + """ + Takes in a vector in list form, concatenates it to form a single vector, + normalizes it to unit length, then returns it in list form together with its norm. + """ + norm_val = np.linalg.norm(np.concatenate(v)) + norm_v = [a / norm_val for a in v] + return norm_v, norm_val + + +def operate_deep(op, var): + # if isinstance(var, list): + if type(var).__name__ == "list": + return [operate_deep(op, x) for x in var] + # if isinstance(var, tuple): + if type(var).__name__ == "tuple": + return tuple(operate_deep(op, x) for x in var) + return op(var) + + +def operate_deep_2v(op, var1, var2): + if isinstance(var1, list): + return [operate_deep_2v(op, a, b) for a, b in zip(var1, var2)] + if isinstance(var1, tuple): + return tuple(operate_deep_2v(op, a, b) for a, b in zip(var1, var2)) + return op(var1, var2) + diff --git a/aix360/data/sif_data/data.pkl b/aix360/data/sif_data/data.pkl new file mode 100644 index 0000000..064920a Binary files /dev/null and b/aix360/data/sif_data/data.pkl differ diff --git a/aix360/datasets/SIF_dataset.py b/aix360/datasets/SIF_dataset.py new file mode 100644 index 0000000..d780980 --- /dev/null +++ b/aix360/datasets/SIF_dataset.py @@ -0,0 +1,272 @@ +# Adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/learn/python/learn/datasets/mnist.py +import numpy as np +from tensorflow.contrib.learn.python.learn.datasets.base import Datasets + + +class DataSetTS(): + def __init__(self, + y:np.ndarray, + y_name:np.ndarray = None, + t: np.ndarray = None, + x: np.ndarray = None, + x_name:np.ndarray = None, + split_ratio = (0.6, 0.2, 0.2), + lag = 1, + scaler = None, + reference_ts = None + ): + + # check all the dimensions: + n_t = y.shape[0] + if len(y.shape) != 2: + y = np.reshape(y, (n_t, -1)) + + n_y = y.shape[1] + + if y_name is not None: + assert y_name.shape[0] == n_y, "y_name must agree with the dimension of y" + + if t is not None: + assert t == n_t, "t must be the same length with x" + + if x is not None and len(x.shape) != 2: + x = np.reshape(x, [x.shape[0], -1]) + assert x.shape[0] == n_t, "x must have the same length with x" + + if x_name is not None: + x_name = np.reshape(x_name, -(1)) + + n_f = x.shape[0] + if x_name is not None: + assert x_name.shape[0] == n_f, "x_name must agree with the dimension of x" + else: + n_f = 0 + + # record all the information + self.x = x + self.x_name = x_name + self.y = y + self.y_name = y_name + self.t = t + + self.n_t = n_t + self.n_f = n_f + + self.split_ratio = split_ratio + self.lag = lag + + self.scaler = scaler + + self.split_train_test() + self.reference_ts = reference_ts + + def moving_win_idx(self, init_win_len, horizon, moving_step, fixed_win, n_t): + stops = np.arange(init_win_len, n_t - horizon, moving_step) + if fixed_win: + starts = stops - init_win_len + else: + starts = np.repeat(0, stops.shape[0]) + train = map(np.arange, starts, stops) + test = map(np.arange, stops, stops+horizon) + return train, test, stops-1 + + def split_train_test(self, split_ratio=None): + if split_ratio is None: + split_ratio = self.split_ratio + else: + self.split_ratio = split_ratio + + hist_idx, pred_idx, t_idx = self.moving_win_idx(self.lag, 1, 1, 
True, self.n_t) + + all_steps = list() # y_n, y_(0:n), x_n, t_n + for tr, te, tt in zip(hist_idx, pred_idx, t_idx): + all_steps.append((self.y[te, :], self.y[tr, :], + self.x[tr, :] if self.x is not None else None, + self.t[tt] if self.t is not None else tt)) + + n_train = int(self.n_t*split_ratio[0]) + + """ + if self.batch_size is not None: # adjust the n_train and n_validate to fit the batch_size + n_train = round(n_train/self.batch_size) * self.batch_size + # n_valid = round(n_valid/self.batch_size) * self.batch_size + assert n_train > 0, "the batch_size is too large" + # assert n_valid > 0, "the batch_size is too large" + """ + self.train = all_steps[:n_train] + self.test = all_steps[(n_train):] + + def _split_y_x(self, dt): + y = np.vstack([s[0] for s in dt]) + if self.x is None: + x = np.vstack([np.reshape(s[1], (1, s[1].shape[0], -1)) for s in dt]) + else: + x = np.vstack([np.reshape(np.hstack((s[1], s[2])), (1, s[1].shape[0], -1)) for s in dt]) + return tuple([x[:, i] for i in range(x.shape[1])]), y + + + def datasets_gen_rnn(self): + """ + Generate a x, y pair ready to feed into RNN + :return: + x - a three dimensional array: sample x time steps x feature + y - a two dimensional array: sample x 1 + """ + train = DataSet(*(self._split_y_x(self.train)), reference_ts=self.reference_ts) + test = DataSet(*(self._split_y_x(self.test)), reference_ts=self.reference_ts) + + return Datasets(train=train, validation=test, test=test) + + +class DataSet(object): + + def __init__(self, x, labels, reference_ts = None): + if isinstance(x, list) or isinstance(x, tuple): + if len(x[0].shape) > 2: + x = [np.reshape(i.astype(np.int), [i.shape[0], -1]) for i in x] + else: + x = [i.astype(np.int) for i in x] + assert (x[0].shape[0] == labels.shape[0]) + + else: + if len(x.shape) > 2: + x = np.reshape(x, [x.shape[0], -1]) + assert (x[0].shape[0] == labels.shape[0]) + x = x.astype(np.int) + + self._num_examples = labels.shape[0] + + self._x = x + self._labels = labels + + # used for the output of index to make sure there is no nan goes into the calculation + self.reference_ts = reference_ts + + + @property + def x(self): + if self._nonnan_idx is None: + return self._x + if isinstance(self._x, list): + return [x[self._nonnan_idx] for x in self._x] + return self._x[self._nonnan_idx] + + @property + def labels(self): + if self._nonnan_idx is None: + return self._labels + return self._labels[self._nonnan_idx] + + @property + def reference_ts(self): + return self._reference_ts + + @reference_ts.setter + def reference_ts(self, value): + if value is None: + self._reference_ts = None + self._nonnan_idx = None + + if isinstance(self._x, list): + self._x_batch = [np.copy(ele) for ele in self.x] + else: + self._x_batch = np.copy(self.x) + else: + self._reference_ts = value + if isinstance(self._x, list): + if value is not None: # check if reference ts contains nan + self._nonnan_idx = np.sum(np.hstack([np.isnan(value[x[:, 0], ...]) for x in self._x] + + [np.isnan(value[self._labels[:, 0], ...])]), axis=1) == 0 + self._x_batch = [np.copy(ele) for ele in self.x] + else: + if value is not None: # check if reference ts contains nan + self._nonnan_idx = np.sum(np.hstack((np.isnan(value[self._x, ...]), + np.isnan(value[self._labels[:, 0], ...]))), axis=1) == 0 + self._x_batch = np.copy(self.x) + + self._labels_batch = np.copy(self.labels) + self._index_in_epoch = 0 + + @property + def num_examples(self): + if self._nonnan_idx is None: + return self._num_examples + else: + return sum(self._nonnan_idx) + + def 
reset_batch(self): + self._index_in_epoch = 0 + if isinstance(self._x_batch, list): + self._x_batch = [np.copy(ele) for ele in self.x] + else: + self._x_batch = np.copy(self.x) + + self._labels_batch = np.copy(self.labels) + + def next_batch(self, batch_size): + assert batch_size <= self.num_examples + + start = self._index_in_epoch + self._index_in_epoch += batch_size + + if self._index_in_epoch > self.num_examples: + + # Shuffle the data + perm = np.arange(self.num_examples) + np.random.shuffle(perm) + if isinstance(self._x_batch, list): + self._x_batch = [i[perm,:] for i in self._x_batch] + else: + self._x_batch = self._x_batch[perm, :] + self._labels_batch = self._labels_batch[perm, :] + + # Start next epoch + start = 0 + self._index_in_epoch = batch_size + + end = self._index_in_epoch + if isinstance(self._x_batch, list): + xx = [ele[start:end] for ele in self._x_batch] + yy = self._labels_batch[start:end, :] + + else: + xx = self._x_batch[start:end] + yy = self._labels_batch[start:end, :] + #assert not np.any(np.vstack(np.isnan([self.reference_ts[x] for x in xx]))), "" + #assert not np.any(np.vstack(np.isnan([self.reference_ts[x] for x in yy]))), "" + return xx, yy + + +def filter_dataset(X, Y, pos_class, neg_class): + """ + Filters out elements of X and Y that aren't one of pos_class or neg_class + then transforms labels of Y so that +1 = pos_class, -1 = neg_class. + """ + assert(X.shape[0] == Y.shape[0]) + assert(len(Y.shape) == 1) + + Y = Y.astype(int) + + pos_idx = Y == pos_class + neg_idx = Y == neg_class + Y[pos_idx] = 1 + Y[neg_idx] = -1 + idx_to_keep = pos_idx | neg_idx + X = X[idx_to_keep, ...] + Y = Y[idx_to_keep] + return (X, Y) + + +def find_distances(target, X, theta=None): + assert len(X.shape) == 2, "X must be 2D, but it is currently %s" % len(X.shape) + target = np.reshape(target, -1) + assert X.shape[1] == len(target), \ + "X (%s) and target (%s) must have same feature dimension" % (X.shape[1], len(target)) + + if theta is None: + return np.linalg.norm(X - target, axis=1) + else: + theta = np.reshape(theta, -1) + + # Project onto theta + return np.abs((X - target).dot(theta)) diff --git a/aix360/models/sif/ar2.data-00000-of-00001 b/aix360/models/sif/ar2.data-00000-of-00001 new file mode 100644 index 0000000..a62ff65 Binary files /dev/null and b/aix360/models/sif/ar2.data-00000-of-00001 differ diff --git a/aix360/models/sif/ar2.index b/aix360/models/sif/ar2.index new file mode 100644 index 0000000..11a948e Binary files /dev/null and b/aix360/models/sif/ar2.index differ diff --git a/aix360/models/sif/ar2.meta b/aix360/models/sif/ar2.meta new file mode 100644 index 0000000..be6d6f1 Binary files /dev/null and b/aix360/models/sif/ar2.meta differ diff --git a/aix360/models/sif/checkpoint b/aix360/models/sif/checkpoint new file mode 100644 index 0000000..3e47d19 --- /dev/null +++ b/aix360/models/sif/checkpoint @@ -0,0 +1,4 @@ +model_checkpoint_path: "ar2" +all_model_checkpoint_paths: "..\\..\\..\\tests\\sif\\arma_output\\ar_test-checkpoint-9999" +all_model_checkpoint_paths: "..\\..\\..\\tests\\sif\\arma_output\\ar_test-checkpoint-19999" +all_model_checkpoint_paths: "ar2" diff --git a/examples/sif/SIF.ipynb b/examples/sif/SIF.ipynb new file mode 100644 index 0000000..559cecc --- /dev/null +++ b/examples/sif/SIF.ipynb @@ -0,0 +1,348 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# SIF Explainer Tutorial\n", + "#### This tutorial illustrates the use of SIF algorithm, a comprehensive single-valued metric, to measure outlier impacts on future predictions. 
It provides a quantitative measure of outlier impact that can be used in a variety of scenarios, such as evaluating outlier detection methods or constructing more harmful outliers. This is a demonstration of the AAAI-2021 paper [Outlier Impact Characterization for Time Series Data](https://ojs.aaai.org/index.php/AAAI/article/view/17379).\n",
+    "#### The SIF algorithm tackles the challenge of outlier interpretation in time series data via [contamination processes](https://www.jstor.org/stable/pdf/3035535.pdf). It assumes that the observed time series is generated by two separate processes: the core process, which produces the clean input, and the contaminating process, which produces the recurring outliers. At each time stamp, with a certain (small) probability, the observed value comes from the contaminating process and is therefore an outlier. The SIF algorithm focuses on generic patchy outliers, whose outlying patterns can persist over consecutive time stamps, and studies the impact of the contaminating process on both parameter estimation and future value prediction.\n",
+    "#### The tutorial is organized as follows:\n",
+    "#### 1. [Train a Model Inherited from SIFExplainer](https://github.com/Leo02016/AIX360/blob/dc0efab88b90f225427347b080897a3e19792403/examples/sif/SIF.ipynb#L12)\n",
+    "#### 2. [Initialize the Required Parameters and Synthesize the Dataset](https://github.com/Leo02016/AIX360/blob/dc0efab88b90f225427347b080897a3e19792403/examples/sif/SIF.ipynb#L61)\n",
+    "#### 3. [Interpret the Model by SIF Value](https://github.com/Leo02016/AIX360/blob/dc0efab88b90f225427347b080897a3e19792403/examples/sif/SIF.ipynb#L139)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## 1. Train a Model Inherited from SIFExplainer\n",
+    "#### First, we define a function that trains an AR2 model inherited from SIFExplainer; this AR2 model is the one we are going to interpret. Note that the AR2 model below can be replaced with the AllLSTM or AllRNN model defined in SIF_NN.py. AllRNN, AllLSTM, and AllAR all inherit from the SIFExplainer model defined in SIF.py."
+ ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "name": "stderr", + "text": [ + "Using TensorFlow backend.\n" + ], + "output_type": "stream" + } + ], + "source": [ + "import numpy as np\n", + "import statsmodels.tsa.api as smt\n", + "import matplotlib.pyplot as plt\n", + "from aix360.datasets.SIF_dataset import DataSetTS\n", + "from aix360.algorithms.sif.SIF_NN import AllRNN, AllLSTM, AllAR\n", + "from aix360.algorithms.sif.SIF_utils import get_contaminate_series\n", + "import os\n", + "import pickle\n", + "\n", + "# arma2 model\n", + "def get_model_train_ar2(data_sets, series, timesteps, w=None, gammas=None, num_train_steps=20000,\n", + " model_dir=None):\n", + " initial_learning_rate = 0.01\n", + " decay_epochs = [10000, 20000]\n", + " batch_size = 20\n", + " n_sample = series.shape[1] if len(series.shape) > 1 else 1\n", + "\n", + " # model can be changed to AllLSTM or AllRNN model defined in SIF_NN.py\n", + " model = AllAR(\n", + " time_steps=timesteps,\n", + " x_dim=n_sample,\n", + " y_dim=n_sample,\n", + " share_param=True,\n", + " batch_size=batch_size,\n", + " time_series=series,\n", + " data_sets=data_sets,\n", + " initial_learning_rate=initial_learning_rate,\n", + " damping=1e-3,\n", + " decay_epochs=decay_epochs,\n", + " mini_batch=False,\n", + " train_dir='arma_output',\n", + " log_dir='logs',\n", + " model_name='ar_test',\n", + " calc_hessian=False,\n", + " w=w,\n", + " gammas=gammas,\n", + " )\n", + " if model_dir is not None:\n", + " print('Loading pre-trained model......')\n", + " model.restore(model_dir)\n", + " else:\n", + " model.train(num_steps=num_train_steps, iter_to_switch_to_batch=10, iter_to_switch_to_sgd=10)\n", + " return model" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 2. Initialize the Required Parameters and Synthesize the Dataset\n", + "#### In the second step, we initialize the parameters for SIFExplainer. \n", + "#### By default, we use the fast mode to accelerate the computation. In the fast mode, we load the pre-trained AR2 model and dataset to skip training stage. If the user plans to retrain the AR2 model, please set the parameter [fast_mode](https://github.com/Leo02016/AIX360/blob/dc0efab88b90f225427347b080897a3e19792403/examples/sif/SIF.ipynb#L65) to be False. Then, the algorithm will generate a new synthetic dataset and train AR2 model from scratch.\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "fast_mode = True\n", + "\n", + "# parameters\n", + "timesteps = 2\n", + "np.random.seed(1)\n", + "n_sample = 1000\n", + "n_time_stamp = 100\n", + "gammas = np.arange(0.0, 0.09, 0.01)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Please ensure that data.pkl and model are downloaded in the following directories." 
+ ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [], + "source": [ + "data_dir = '../../aix360/data/sif_data/data.pkl'\n", + "model_dir = '../../aix360/models/sif/ar2'\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Skip generating the synthetic dataset and training the model if using fast-mode. In the fast mode, we directly load the saved dataset and pre-trained model" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "text": [ + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF_NN.py:18: The name tf.random_normal is deprecated. Please use tf.random.normal instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:29: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF_NN.py:38: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:649: The name tf.squared_difference is deprecated. Please use tf.math.squared_difference instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:654: The name tf.is_nan is deprecated. Please use tf.math.is_nan instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Anaconda3\\envs\\tf_115\\lib\\site-packages\\tensorflow_core\\python\\ops\\array_ops.py:1475: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\nInstructions for updating:\nUse tf.where in 2.0, which has the same broadcast rule as np.where\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:657: The name tf.add_to_collection is deprecated. Please use tf.compat.v1.add_to_collection instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:659: The name tf.get_collection is deprecated. Please use tf.compat.v1.get_collection instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:96: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:614: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:625: The name tf.train.GradientDescentOptimizer is deprecated. Please use tf.compat.v1.train.GradientDescentOptimizer instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:104: The name tf.train.Saver is deprecated. Please use tf.compat.v1.train.Saver instead.\n\n", + "WARNING:tensorflow:From C:\\Users\\lecheng4\\Documents\\GitHub\\AIX360\\aix360\\algorithms\\sif\\SIF.py:161: The name tf.global_variables_initializer is deprecated. 
Please use tf.compat.v1.global_variables_initializer instead.\n\n", + "Total number of parameters: 2\nLoading pre-trained model......\nINFO:tensorflow:Restoring parameters from ../../aix360/models/sif/ar2\n" + ], + "output_type": "stream" + } + ], + "source": [ + "if fast_mode:\n", + " assert os.path.exists(data_dir), \"Could not find the data.pkl in {}\".format(data_dir)\n", + " # load time series from data.pkl file\n", + " series = pickle.load(open(data_dir, \"rb\"))\n", + " data_sets = DataSetTS(np.arange(len(series)), np.array(['Y']), None, None, None,\n", + " lag=timesteps).datasets_gen_rnn()\n", + " # initialize and train the model which takes the clean time sequence as input and makes prediction\n", + " model = get_model_train_ar2(data_sets, series, timesteps, gammas=gammas, model_dir=model_dir)\n", + "else:\n", + " # ar and ma are two parameters controlling the synthetic time sequence data\n", + " ar = np.r_[1, -np.array([0.7, -0.3])]\n", + " ma = np.r_[1, -np.array([0.1])]\n", + "\n", + " # generate the core process or clean time sequence data\n", + " series = [smt.arma_generate_sample(ar, ma, n_time_stamp) for i in range(n_sample)]\n", + " series = np.vstack(series)\n", + " pickle.dump(series, open(data_dir, \"wb\"))\n", + " data_sets = DataSetTS(np.arange(len(series)), np.array(['Y']), None, None, None,\n", + " lag=timesteps).datasets_gen_rnn()\n", + " # initialize and train the model which takes the clean time sequence as input and makes prediction\n", + " model = get_model_train_ar2(data_sets, series, timesteps, gammas=gammas, model_dir=None)\n", + " model.save(model_dir)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### y_contaminate is considered as the contaminating process in the experiment" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "y_contaminate = np.random.randn(n_sample)\n", + "model.update_configure(y_contaminate, gammas)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Insert the contaminated data into the clean time sequence data" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [], + "source": [ + "contaminated_series = get_contaminate_series(series, y_contaminate, data_sets.train.labels)\n", + "# plot contaminated series\n", + "# plt.plot(contaminated_series)\n", + "# plt.show()\n", + "# plt.close()\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 3. Interpret the Model by SIF Value\n", + "#### In the last step, we compute SIF value. Notice that the model is inherited from SIFExplainer and the user could simply call model.explain_instance() function to get the SIF value." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "name": "stdout", + "text": [ + "Regression R-squared: 0.179848\n162.92198777198792 s to compute if_v\n0.03913569450378418 s to compute patchy pred gamma\n0.0718080997467041 s to compute psi_y\nSIF = -119.54141353277345\n" + ], + "output_type": "stream" + } + ], + "source": [ + "# compute SIF value\n", + "sif = model.explain_instance(y_contaminate, timesteps, 1, None, 30, verbose=False)\n", + "print('SIF = {}'.format(sif))\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/requirement.txt b/requirement.txt new file mode 100644 index 0000000..de93cc0 --- /dev/null +++ b/requirement.txt @@ -0,0 +1,2 @@ +statsmodels==0.11.1 +matplotlib==3.3.1 \ No newline at end of file diff --git a/sif_environment.yml b/sif_environment.yml new file mode 100644 index 0000000..e9c7174 --- /dev/null +++ b/sif_environment.yml @@ -0,0 +1,200 @@ +name: tf_115 +channels: + - pytorch + - defaults +dependencies: + - _tflow_select=2.1.0=gpu + - absl-py=0.9.0=py37_0 + - argon2-cffi=20.1.0=py37he774522_1 + - astor=0.8.1=py37_0 + - async_generator=1.10=py37h28b3542_0 + - attrs=20.2.0=py_0 + - backcall=0.2.0=py_0 + - blas=1.0=mkl + - bleach=3.2.1=py_0 + - blinker=1.4=py37_0 + - brotlipy=0.7.0=py37he774522_1000 + - ca-certificates=2020.10.14=0 + - cachetools=4.1.1=py_0 + - certifi=2020.12.5=py37haa95532_0 + - cffi=1.14.2=py37h7a1dbc1_0 + - chardet=3.0.4=py37_1003 + - click=7.1.2=py_0 + - cloudpickle=1.6.0=py_0 + - colorama=0.4.3=py_0 + - cryptography=3.1=py37h7a1dbc1_0 + - cudatoolkit=10.0.130=0 + - cudnn=7.6.5=cuda10.0_0 + - cycler=0.10.0=py37_0 + - cython=0.29.21=py37ha925a31_0 + - cytoolz=0.11.0=py37he774522_0 + - dask-core=2.30.0=py_0 + - decorator=4.4.2=py_0 + - defusedxml=0.6.0=py_0 + - docopt=0.6.2=py37_0 + - entrypoints=0.3=py37_0 + - et_xmlfile=1.0.1=py_1001 + - freetype=2.10.2=hd328e21_0 + - gast=0.2.2=py37_0 + - google-auth=1.21.0=py_0 + - google-auth-oauthlib=0.4.1=py_2 + - google-pasta=0.2.0=py_0 + - grpcio=1.31.0=py37he7da953_0 + - h5py=2.10.0=py37h5e291fa_0 + - hdf5=1.10.4=h7ebc959_0 + - icc_rt=2019.0.0=h0cc432a_1 + - icu=58.2=ha925a31_3 + - idna=2.10=py_0 + - imageio=2.9.0=py_0 + - importlib-metadata=1.7.0=py37_0 + - importlib_metadata=1.7.0=0 + - intel-openmp=2020.2=254 + - ipykernel=5.3.4=py37h5ca1d4c_0 + - ipython=7.18.1=py37h5ca1d4c_0 + - ipython_genutils=0.2.0=py37_0 + - ipywidgets=7.5.1=py_1 + - jdcal=1.4.1=py_0 + - jedi=0.17.2=py37_0 + - jinja2=2.11.2=py_0 + - joblib=0.16.0=py_0 + - jpeg=9b=hb83a4c4_2 + - jsonschema=3.2.0=py_2 + - jupyter=1.0.0=py37_7 + - jupyter_client=6.1.7=py_0 + - jupyter_console=6.2.0=py_0 + - jupyter_core=4.6.3=py37_0 + - jupyterlab_pygments=0.1.2=py_0 + - keras-applications=1.0.8=py_1 + - keras-preprocessing=1.1.0=py_1 + - kiwisolver=1.2.0=py37h74a9793_0 + - libpng=1.6.37=h2a8f88b_0 + - libprotobuf=3.13.0=h200bbdf_0 + - 
libsodium=1.0.18=h62dcd97_0 + - libtiff=4.1.0=h56a325e_1 + - lz4-c=1.9.2=h62dcd97_1 + - m2w64-gcc-libgfortran=5.3.0=6 + - m2w64-gcc-libs=5.3.0=7 + - m2w64-gcc-libs-core=5.3.0=7 + - m2w64-gmp=6.1.0=2 + - m2w64-libwinpthread-git=5.0.0.4634.697f757=2 + - markdown=3.2.2=py37_0 + - markupsafe=1.1.1=py37hfa6e2cd_1 + - matplotlib=3.3.1=0 + - matplotlib-base=3.3.1=py37hba9282a_0 + - mistune=0.8.4=py37hfa6e2cd_1001 + - mkl=2020.2=256 + - mkl-service=2.3.0=py37hb782905_0 + - mkl_fft=1.1.0=py37h45dec08_0 + - mkl_random=1.1.1=py37h47e9c7a_0 + - msys2-conda-epoch=20160418=1 + - nbclient=0.5.0=py_0 + - nbconvert=6.0.7=py37_0 + - nbformat=5.0.7=py_0 + - nest-asyncio=1.4.1=py_0 + - networkx=2.5=py_0 + - ninja=1.10.1=py37h7ef1ec2_0 + - notebook=6.1.4=py37_0 + - numpy=1.19.1=py37h5510c5b_0 + - numpy-base=1.19.1=py37ha3acd2a_0 + - oauthlib=3.1.0=py_0 + - olefile=0.46=py37_0 + - openpyxl=3.0.5=py_0 + - openssl=1.1.1h=he774522_0 + - opt_einsum=3.1.0=py_0 + - packaging=20.4=py_0 + - pandas=1.1.1=py37ha925a31_0 + - pandoc=2.11=h9490d1a_0 + - pandocfilters=1.4.2=py37_1 + - parso=0.7.0=py_0 + - patsy=0.5.1=py37_0 + - pickleshare=0.7.5=py37_1001 + - pillow=7.2.0=py37hcc1f983_0 + - pip=20.2.2=py37_0 + - prometheus_client=0.8.0=py_0 + - prompt-toolkit=3.0.8=py_0 + - prompt_toolkit=3.0.8=0 + - protobuf=3.13.0=py37h6538335_0 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 + - pycparser=2.20=py_2 + - pygments=2.7.1=py_0 + - pyjwt=1.7.1=py37_0 + - pyopenssl=19.1.0=py_1 + - pyparsing=2.4.7=py_0 + - pyqt=5.9.2=py37h6538335_2 + - pyreadline=2.1=py37_1 + - pyrsistent=0.17.3=py37he774522_0 + - pysocks=1.7.1=py37_1 + - python=3.7.9=h60c2a47_0 + - python-dateutil=2.8.1=py_0 + - pytorch=1.2.0=py3.7_cuda100_cudnn7_1 + - pytz=2020.1=py_0 + - pywavelets=1.1.1=py37he774522_2 + - pywin32=227=py37he774522_1 + - pywinpty=0.5.7=py37_0 + - pyzmq=19.0.2=py37ha925a31_1 + - qt=5.9.7=vc14h73c81de_0 + - qtconsole=4.7.7=py_0 + - qtpy=1.9.0=py_0 + - requests=2.24.0=py_0 + - requests-oauthlib=1.3.0=py_0 + - rsa=4.6=py_0 + - scikit-image=0.16.2=py37h47e9c7a_0 + - scikit-learn=0.23.2=py37h47e9c7a_0 + - scipy=1.5.2=py37h9439919_0 + - seaborn=0.10.1=py_0 + - send2trash=1.5.0=py37_0 + - setuptools=49.6.0=py37_0 + - sip=4.19.8=py37h6538335_0 + - six=1.15.0=py_0 + - sqlite=3.33.0=h2a8f88b_0 + - statsmodels=0.11.1=py37he774522_0 + - tensorboard=2.2.1=pyh532a8cf_0 + - tensorboard-plugin-wit=1.6.0=py_0 + - tensorflow=1.15.0=gpu_py37hc3743a6_0 + - tensorflow-base=1.15.0=gpu_py37h1afeea4_0 + - tensorflow-estimator=1.15.1=pyh2649769_0 + - tensorflow-gpu=1.15.0=h0d30ee6_0 + - termcolor=1.1.0=py37_1 + - terminado=0.9.1=py37_0 + - testpath=0.4.4=py_0 + - threadpoolctl=2.1.0=pyh5ca1d4c_0 + - tk=8.6.10=he774522_0 + - toolz=0.11.1=py_0 + - torchvision=0.4.0=py37_cu100 + - tornado=6.0.4=py37he774522_1 + - traitlets=5.0.4=py_0 + - urllib3=1.25.10=py_0 + - vc=14.1=h0510ff6_4 + - vs2015_runtime=14.16.27012=hf0eaf9b_3 + - wcwidth=0.2.5=py_0 + - webencodings=0.5.1=py37_1 + - werkzeug=0.16.1=py_0 + - wheel=0.35.1=py_0 + - widgetsnbextension=3.5.1=py37_0 + - win_inet_pton=1.1.0=py37_0 + - wincertstore=0.2=py37_0 + - winpty=0.4.3=4 + - wrapt=1.12.1=py37he774522_1 + - xz=5.2.5=h62dcd97_0 + - yaml=0.2.5=he774522_0 + - zeromq=4.3.2=ha925a31_3 + - zipp=3.1.0=py_0 + - zlib=1.2.11=h62dcd97_4 + - zstd=1.4.5=h04227a9_0 + - pip: + - args==0.1.0 + - clint==0.5.1 + - coc==0.2 + - coco==0.4.0 + - dill==0.3.2 + - imgaug==0.4.0 + - imutils==0.5.3 + - opencv-python==4.4.0.44 + - pycocotools-windows==2.0.0.2 + - python-crontab==2.5.1 + - pyyaml==5.3.1 + - shapely==1.7.1 + - 
tablib==2.0.0 +prefix: C:\Users\ibm\Anaconda3\envs\tf_115 + diff --git a/tests/sif/test_SIF.py b/tests/sif/test_SIF.py new file mode 100644 index 0000000..36bc0c6 --- /dev/null +++ b/tests/sif/test_SIF.py @@ -0,0 +1,107 @@ +import numpy as np +import statsmodels.tsa.api as smt +import matplotlib.pyplot as plt +from aix360.datasets.SIF_dataset import DataSetTS +from aix360.algorithms.sif.SIF_NN import AllRNN, AllLSTM, AllAR +from aix360.algorithms.sif.SIF_utils import get_contaminate_series +import unittest +import argparse +import os +import pickle + + +class SIF(unittest.TestCase): + def get_model_train_ar2(self, data_sets, series, timesteps, w=None, gammas=None, num_train_steps=20000, + model_dir=None): + initial_learning_rate = 0.01 + decay_epochs = [10000, 20000] + batch_size = 20 + n_sample = series.shape[1] if len(series.shape) > 1 else 1 + + # model can be changed to AllLSTM or AllRNN model defined in SIF_NN.py + model = AllAR( + time_steps=timesteps, + x_dim=n_sample, + y_dim=n_sample, + share_param=True, + batch_size=batch_size, + time_series=series, + data_sets=data_sets, + initial_learning_rate=initial_learning_rate, + damping=1e-3, + decay_epochs=decay_epochs, + mini_batch=False, + train_dir='arma_output', + log_dir='logs', + model_name='ar_test', + calc_hessian=False, + w=w, + gammas=gammas, + ) + if model_dir is not None: + print('Loading pre-trained model......') + model.restore(model_dir) + else: + model.train(num_steps=num_train_steps, iter_to_switch_to_batch=10, iter_to_switch_to_sgd=10) + return model + + def test_SIF(self): + parser = argparse.ArgumentParser(description='RGCN') + parser.add_argument('-fast_mode', dest='fast_mode', action='store_true') + parser.set_defaults(fast_mode=True) + args = parser.parse_args() + + # parameters + timesteps = 2 + np.random.seed(1) + n_sample = 1000 + n_time_stamp = 100 + gammas = np.arange(0.0, 0.09, 0.01) + data_dir = '../../aix360/data/sif_data/data.pkl' + model_dir = '../../aix360/models/sif/ar2' + + # Skip generating the synthetic dataset and training the model. 
+ # In the fast mode, we directly load the saved dataset and pre-trained model + if args.fast_mode: + assert os.path.exists(data_dir), "Could not find the data.pkl in {}".format(data_dir) + # load time series from data.pkl file + series = pickle.load(open(data_dir, "rb")) + data_sets = DataSetTS(np.arange(len(series)), np.array(['Y']), None, None, None, + lag=timesteps).datasets_gen_rnn() + # initialize and train the model which takes the clean time sequence as input and makes prediction + model = self.get_model_train_ar2(data_sets, series, timesteps, gammas=gammas, model_dir=model_dir) + else: + # ar and ma are two parameters controlling the synthetic time sequence data + ar = np.r_[1, -np.array([0.7, -0.3])] + ma = np.r_[1, -np.array([0.1])] + + # generate the core process or clean time sequence data + series = [smt.arma_generate_sample(ar, ma, n_time_stamp) for i in range(n_sample)] + series = np.vstack(series) + pickle.dump(series, open(data_dir, "wb")) + data_sets = DataSetTS(np.arange(len(series)), np.array(['Y']), None, None, None, + lag=timesteps).datasets_gen_rnn() + # initialize and train the model which takes the clean time sequence as input and makes prediction + model = self.get_model_train_ar2(data_sets, series, timesteps, gammas=gammas, model_dir=None) + model.save(model_dir) + + # generate the contaminating process + y_contaminate = np.random.randn(n_sample) + + # insert the contaminated data into the clean time sequence data + contaminated_series = get_contaminate_series(series, y_contaminate, data_sets.train.labels) + + # plot contaminated series + plt.plot(contaminated_series) + plt.savefig('arma') + plt.close() + + # compute SIF value + model.update_configure(y_contaminate, gammas) + # sif = model.get_sif(y_contaminate, timesteps, 1, None, 30, verbose=False) + sif = model.explain_instance(y_contaminate, timesteps, 1, None, 30, verbose=False) + print('SIF = {}'.format(sif)) + + +if __name__ == '__main__': + unittest.main()
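A minimal, self-contained sketch (not part of the patch above) of the contamination model that the tutorial and the test exercise: a core ARMA process is observed through a Bernoulli mixture with a contaminating white-noise process, mirroring get_contaminate_series in aix360/algorithms/sif/SIF_utils.py. Variable names below are illustrative only; this is the kind of perturbed series whose impact the SIF value is meant to quantify.

import numpy as np
import statsmodels.tsa.api as smt

rng = np.random.RandomState(0)
n_time_stamp, gamma = 100, 0.05

# Core process: an ARMA(2, 1) sample, matching the tutorial's synthetic data.
ar = np.r_[1, -np.array([0.7, -0.3])]
ma = np.r_[1, -np.array([0.1])]
core = smt.arma_generate_sample(ar, ma, n_time_stamp)

# Contaminating process: white-noise outliers.
contaminating = rng.randn(n_time_stamp)

# Bernoulli mixing: with probability gamma, the observed value at a time stamp
# is drawn from the contaminating process instead of the core process.
is_outlier = rng.binomial(n=1, p=gamma, size=n_time_stamp).astype(bool)
observed = np.where(is_outlier, contaminating, core)
print("contaminated time stamps:", np.flatnonzero(is_outlier))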