From 588c323d402b778bc03acafb20dd4b7af56a6687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Mon, 9 Aug 2021 10:37:06 +0200 Subject: [PATCH 01/17] [GIT] add PyCharm files to gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 22c1ad65..8e898fb3 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,10 @@ docs/_build/ # Jupyter Notebook .ipynb_checkpoints +# PyCharm +.idea/ +.coverage + # Distribution / packaging .Python env/ From c0846913c4d3a3c4930beeb4b57582c921ef33ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Mon, 9 Aug 2021 14:20:15 +0200 Subject: [PATCH 02/17] [ADD] Tolstoi Char RNN testproblem --- deepobs/pytorch/testproblems/__init__.py | 1 + .../pytorch/testproblems/tolstoi_char_rnn.py | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 deepobs/pytorch/testproblems/tolstoi_char_rnn.py diff --git a/deepobs/pytorch/testproblems/__init__.py b/deepobs/pytorch/testproblems/__init__.py index 3b8bd446..9cc5ac2a 100644 --- a/deepobs/pytorch/testproblems/__init__.py +++ b/deepobs/pytorch/testproblems/__init__.py @@ -21,3 +21,4 @@ from .svhn_3c3d import svhn_3c3d from .svhn_wrn164 import svhn_wrn164 from .testproblem import TestProblem +from .tolstoi_char_rnn import tolstoi_char_rnn diff --git a/deepobs/pytorch/testproblems/tolstoi_char_rnn.py b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py new file mode 100644 index 00000000..68cd8ad9 --- /dev/null +++ b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +"""A vanilla RNN architecture for Tolstoi.""" +from torch import nn + +from deepobs.pytorch.testproblems.testproblem import WeightRegularizedTestproblem +from .testproblems_modules import net_char_rnn +from ..datasets.tolstoi import tolstoi + + +class tolstoi_char_rnn(WeightRegularizedTestproblem): + """DeepOBS test problem class for char_rnn network on Tolstoi. + + TODO: add some more details how the test problem works + """ + def __init__(self, batch_size, l2_reg=0.0005): + """Create a new char_rnn test problem instance on Tolstoi. + + Args: + batch_size (int): Batch size to use. + l2_reg (float): L2-regularization factor. L2-Regularization (weight decay) + is used on the weights but not the biases. + Defaults to ``5e-4``. + """ + print(f"batch_size={batch_size}") + super(tolstoi_char_rnn, self).__init__(batch_size, l2_reg) + + def set_up(self): + """Set up the Char RNN test problem on Tolstoi.""" + self.data = tolstoi(self._batch_size) + self.loss_function = nn.CrossEntropyLoss + self.net = net_char_rnn(hidden_dim=10, num_layers=2, seq_len=50, vocab_size=100) + self.net.to(self._device) + self.regularization_groups = self.get_regularization_groups() From e5db1afb3448a2bb8e4b817f7a90ce60fd410656 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Mon, 9 Aug 2021 14:21:41 +0200 Subject: [PATCH 03/17] [FIX] Tolstoi dataset --- deepobs/pytorch/datasets/tolstoi.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/deepobs/pytorch/datasets/tolstoi.py b/deepobs/pytorch/datasets/tolstoi.py index 562b104a..f577809a 100644 --- a/deepobs/pytorch/datasets/tolstoi.py +++ b/deepobs/pytorch/datasets/tolstoi.py @@ -7,8 +7,8 @@ import torch from torch.utils import data as dat -from .. import config from . 
import dataset +from ...config import get_data_dir class tolstoi(dataset.DataSet): @@ -43,7 +43,7 @@ def __init__(self, batch_size, seq_length=50, train_eval_size=653237): self._train_eval_size = train_eval_size super(tolstoi, self).__init__(batch_size) - def _make_dataloader(self, filepath): + def _make_tolstoi_dataloader(self, filepath): # Load the array of character ids, determine the number of batches that # can be produced, given batch size and sequence lengh arr = np.load(filepath) @@ -79,8 +79,8 @@ def _make_dataloader(self, filepath): return dataset def _make_train_dataloader(self): - filepath = os.path.join(config.get_data_dir(), "tolstoi", "train.npy") - return self._make_dataloader(filepath) + filepath = os.path.join(get_data_dir(), "tolstoi", "train.npy") + return self._make_tolstoi_dataloader(filepath) def _make_train_eval_dataloader(self): indices = np.arange( @@ -90,5 +90,13 @@ def _make_train_eval_dataloader(self): return dat.TensorDataset(train_eval_set[0], train_eval_set[1]) def _make_test_dataloader(self): - filepath = os.path.join(config.get_data_dir(), "tolstoi", "test.npy") - return self._make_dataloader(filepath) + filepath = os.path.join(get_data_dir(), "tolstoi", "test.npy") + return self._make_tolstoi_dataloader(filepath) + + def _make_train_and_valid_dataloader(self): + # TODO check whether this is intended usage + """return self._make_train_and_valid_dataloader_helper( + self._make_train_dataloader(), + self._make_train_dataloader(), + )""" + return self._make_train_dataloader(), self._make_train_dataloader() From 6701e1a75ac366a9b282c8e729ba2534b38d6f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Mon, 9 Aug 2021 14:22:36 +0200 Subject: [PATCH 04/17] [ADD] net_char_rnn: debug with print --- deepobs/pytorch/testproblems/testproblems_modules.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/deepobs/pytorch/testproblems/testproblems_modules.py b/deepobs/pytorch/testproblems/testproblems_modules.py index 682284bd..84387c63 100644 --- a/deepobs/pytorch/testproblems/testproblems_modules.py +++ b/deepobs/pytorch/testproblems/testproblems_modules.py @@ -725,14 +725,18 @@ def __init__(self, seq_len, hidden_dim, vocab_size, num_layers): def forward(self, x, state=None): """state is a tuple for hidden and cell state for initialisation of the lstm""" + print("net_char_rnn:forward()") + print(f"x.shape = {x.shape}") + # print(f"x={x}") x = self.embedding(x) # if no state is provided, default the state to zeros if state is None: x, new_state = self.lstm(x) else: x, new_state = self.lstm(x, state) - x = self.dense(x) - return x, new_state + output = self.dense(x) + print(f"output.shape={output.shape}") + return output # , new_state class net_quadratic_deep(nn.Sequential): From a7e407160963cd51889579b8f32d8797b6fef8c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Mon, 16 Aug 2021 11:23:34 +0200 Subject: [PATCH 05/17] [ADD] add TODO, fix parameters --- deepobs/pytorch/testproblems/tolstoi_char_rnn.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/deepobs/pytorch/testproblems/tolstoi_char_rnn.py b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py index 68cd8ad9..b104ef67 100644 --- a/deepobs/pytorch/testproblems/tolstoi_char_rnn.py +++ b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py @@ -12,6 +12,17 @@ class tolstoi_char_rnn(WeightRegularizedTestproblem): TODO: add some more details how the test problem works """ + + # TODO check differences compared to tensorflow + # - lstm 
layer has two bias -> "_check_parameters()" + # - loss function: + # - tensorflow: mean across time, sum across batch + # - pytorch: mean across all + # - lstm parameters counted separately (weight_{ih|hh}_l{i}) + # - dropout layers + # - tensorflow: dropout before and after each layer with keep=0.8 + # - pytorch: dropout in-between LSTM + dropout before and after LSTM + def __init__(self, batch_size, l2_reg=0.0005): """Create a new char_rnn test problem instance on Tolstoi. @@ -28,6 +39,6 @@ def set_up(self): """Set up the Char RNN test problem on Tolstoi.""" self.data = tolstoi(self._batch_size) self.loss_function = nn.CrossEntropyLoss - self.net = net_char_rnn(hidden_dim=10, num_layers=2, seq_len=50, vocab_size=100) + self.net = net_char_rnn(hidden_dim=128, num_layers=2, seq_len=50, vocab_size=83) self.net.to(self._device) self.regularization_groups = self.get_regularization_groups() From 8970099df314c8e3bec5d1f610f9e18ff61c98bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Mon, 16 Aug 2021 11:24:22 +0200 Subject: [PATCH 06/17] [ADD] fix network, remove print --- .../pytorch/testproblems/testproblems_modules.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/deepobs/pytorch/testproblems/testproblems_modules.py b/deepobs/pytorch/testproblems/testproblems_modules.py index 84387c63..3b95b73b 100644 --- a/deepobs/pytorch/testproblems/testproblems_modules.py +++ b/deepobs/pytorch/testproblems/testproblems_modules.py @@ -713,6 +713,7 @@ def __init__(self, seq_len, hidden_dim, vocab_size, num_layers): self.embedding = nn.Embedding( num_embeddings=vocab_size, embedding_dim=hidden_dim ) + self.dropout = nn.Dropout(p=0.2) self.lstm = nn.LSTM( input_size=hidden_dim, hidden_size=hidden_dim, @@ -720,22 +721,28 @@ def __init__(self, seq_len, hidden_dim, vocab_size, num_layers): dropout=0.2, batch_first=True, ) + """new_bias_l0 = torch.zeros_like(self.lstm.bias_ih_l0, device=self.lstm.bias_ih_l0.device) + new_bias_l1 = torch.zeros_like(self.lstm.bias_ih_l1, device=self.lstm.bias_ih_l1.device) + del self.lstm.bias_ih_l0 + del self.lstm.bias_ih_l1 + self.lstm.bias_ih_l0 = new_bias_l0 + self.lstm.bias_ih_l1 = new_bias_l1""" + self.dense = nn.Linear(in_features=hidden_dim, out_features=vocab_size) # TODO init layers? 
def forward(self, x, state=None): """state is a tuple for hidden and cell state for initialisation of the lstm""" - print("net_char_rnn:forward()") - print(f"x.shape = {x.shape}") - # print(f"x={x}") x = self.embedding(x) # if no state is provided, default the state to zeros + x = self.dropout(x) if state is None: x, new_state = self.lstm(x) else: x, new_state = self.lstm(x, state) + x = self.dropout(x) output = self.dense(x) - print(f"output.shape={output.shape}") + output = output.transpose(1, 2) return output # , new_state From cfa7f8f2cc50842260126639a1b8c3dd101f931f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Mon, 16 Aug 2021 11:25:13 +0200 Subject: [PATCH 07/17] [ADD] LSTM PyTorch: different parameter count --- tests/test_testproblems.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_testproblems.py b/tests/test_testproblems.py index 10a8ad95..2f35912e 100644 --- a/tests/test_testproblems.py +++ b/tests/test_testproblems.py @@ -147,8 +147,13 @@ def _check_parameters(tproblem, framework): num_param = [] if framework == "pytorch": - for parameter in tproblem.net.parameters(): - num_param.append(parameter.numel()) + for name, parameter in tproblem.net.named_parameters(): + if "weight_hh_l" in name: + num_param[-1] += parameter.numel() + elif "bias_hh_l" in name: + pass + else: + num_param.append(parameter.numel()) elif framework == "tensorflow": num_param = [np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()] From a899ed29d41d76236a85720b74fe3cede9d864cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Mon, 16 Aug 2021 13:49:50 +0200 Subject: [PATCH 08/17] [ADD] SGD Runner --- examples/runner_sgd_pytorch.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 examples/runner_sgd_pytorch.py diff --git a/examples/runner_sgd_pytorch.py b/examples/runner_sgd_pytorch.py new file mode 100644 index 00000000..38fee120 --- /dev/null +++ b/examples/runner_sgd_pytorch.py @@ -0,0 +1,11 @@ +"""StandardRunner: Default SGD.""" + +from torch.optim import SGD + +from deepobs import pytorch as pt + +optimizer_class = SGD +hyperparams = {"lr": {"type": float}} + +runner = pt.runners.StandardRunner(optimizer_class, hyperparams) +runner.run() From b46ccf35989abb714956d40246a31ce8463e197c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Fri, 17 Sep 2021 11:40:22 +0200 Subject: [PATCH 09/17] [REF] adjust NR_PT_TESTPROBLEMS to 21 --- tests/test_testproblems.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_testproblems.py b/tests/test_testproblems.py index 2f35912e..0258624b 100644 --- a/tests/test_testproblems.py +++ b/tests/test_testproblems.py @@ -19,7 +19,7 @@ # Basic Settings of the Test BATCH_SIZE = 8 -NR_PT_TESTPROBLEMS = 20 +NR_PT_TESTPROBLEMS = 21 NR_TF_TESTPROBLEMS = 27 DEVICES = ["cpu", "cuda:0"] if torch.cuda.is_available() else ["cpu"] FRAMEWORKS = ["pytorch", "tensorflow"] From 6ba92d0d96c68aee86346aad434b0f377387864b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Fri, 17 Sep 2021 11:51:18 +0200 Subject: [PATCH 10/17] [ADD] Tolstoi: PyTorch: redundant bias: set to zero and requires_grad=False --- deepobs/pytorch/testproblems/testproblems_modules.py | 11 ++++------- tests/test_testproblems.py | 6 +++--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/deepobs/pytorch/testproblems/testproblems_modules.py b/deepobs/pytorch/testproblems/testproblems_modules.py index 3b95b73b..dfd07841 100644 --- 
a/deepobs/pytorch/testproblems/testproblems_modules.py +++ b/deepobs/pytorch/testproblems/testproblems_modules.py @@ -721,15 +721,12 @@ def __init__(self, seq_len, hidden_dim, vocab_size, num_layers): dropout=0.2, batch_first=True, ) - """new_bias_l0 = torch.zeros_like(self.lstm.bias_ih_l0, device=self.lstm.bias_ih_l0.device) - new_bias_l1 = torch.zeros_like(self.lstm.bias_ih_l1, device=self.lstm.bias_ih_l1.device) - del self.lstm.bias_ih_l0 - del self.lstm.bias_ih_l1 - self.lstm.bias_ih_l0 = new_bias_l0 - self.lstm.bias_ih_l1 = new_bias_l1""" + self.lstm.bias_ih_l0.data = torch.zeros_like(self.lstm.bias_ih_l0, device=self.lstm.bias_ih_l0.device) + self.lstm.bias_ih_l1.data = torch.zeros_like(self.lstm.bias_ih_l1, device=self.lstm.bias_ih_l0.device) + self.lstm.bias_ih_l0.requires_grad = False + self.lstm.bias_ih_l1.requires_grad = False self.dense = nn.Linear(in_features=hidden_dim, out_features=vocab_size) - # TODO init layers? def forward(self, x, state=None): """state is a tuple for hidden and cell state for initialisation of the lstm""" diff --git a/tests/test_testproblems.py b/tests/test_testproblems.py index 0258624b..a12bb353 100644 --- a/tests/test_testproblems.py +++ b/tests/test_testproblems.py @@ -148,10 +148,10 @@ def _check_parameters(tproblem, framework): if framework == "pytorch": for name, parameter in tproblem.net.named_parameters(): - if "weight_hh_l" in name: + if parameter.requires_grad is False: + continue + elif "weight_hh_l" in name: num_param[-1] += parameter.numel() - elif "bias_hh_l" in name: - pass else: num_param.append(parameter.numel()) elif framework == "tensorflow": From a46b8ef809dd875388cd67a906316f6af20302c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Fri, 17 Sep 2021 11:55:44 +0200 Subject: [PATCH 11/17] [REF] Tolstoi, PyTorch: adjust dropout probability to tensorflow --- deepobs/pytorch/testproblems/testproblems_modules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepobs/pytorch/testproblems/testproblems_modules.py b/deepobs/pytorch/testproblems/testproblems_modules.py index dfd07841..dc125157 100644 --- a/deepobs/pytorch/testproblems/testproblems_modules.py +++ b/deepobs/pytorch/testproblems/testproblems_modules.py @@ -718,7 +718,7 @@ def __init__(self, seq_len, hidden_dim, vocab_size, num_layers): input_size=hidden_dim, hidden_size=hidden_dim, num_layers=num_layers, - dropout=0.2, + dropout=0.36, # tensorflow two dropouts with keep=0.8 each -> dropout=1-0.8*0.8=0.36 batch_first=True, ) self.lstm.bias_ih_l0.data = torch.zeros_like(self.lstm.bias_ih_l0, device=self.lstm.bias_ih_l0.device) From ba2f00206c0f252fc3a32a83385bf42ae6c584d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Fri, 17 Sep 2021 12:02:57 +0200 Subject: [PATCH 12/17] [REF] adjust TODO --- deepobs/pytorch/testproblems/tolstoi_char_rnn.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/deepobs/pytorch/testproblems/tolstoi_char_rnn.py b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py index b104ef67..a6f90967 100644 --- a/deepobs/pytorch/testproblems/tolstoi_char_rnn.py +++ b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py @@ -14,14 +14,10 @@ class tolstoi_char_rnn(WeightRegularizedTestproblem): """ # TODO check differences compared to tensorflow - # - lstm layer has two bias -> "_check_parameters()" + # - often the test on cuda fails: acc is greater than 1.0 # - loss function: # - tensorflow: mean across time, sum across batch # - pytorch: mean across all - # - lstm parameters counted separately 
(weight_{ih|hh}_l{i}) - # - dropout layers - # - tensorflow: dropout before and after each layer with keep=0.8 - # - pytorch: dropout in-between LSTM + dropout before and after LSTM def __init__(self, batch_size, l2_reg=0.0005): """Create a new char_rnn test problem instance on Tolstoi. From 523e4a8270eba83e7d9c898951b2822c4b804628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Thu, 7 Oct 2021 11:08:15 +0200 Subject: [PATCH 13/17] [REF] cleanup --- deepobs/pytorch/datasets/tolstoi.py | 6 +--- .../pytorch/testproblems/tolstoi_char_rnn.py | 32 +++++++++++++++++-- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/deepobs/pytorch/datasets/tolstoi.py b/deepobs/pytorch/datasets/tolstoi.py index f577809a..ba973cd1 100644 --- a/deepobs/pytorch/datasets/tolstoi.py +++ b/deepobs/pytorch/datasets/tolstoi.py @@ -94,9 +94,5 @@ def _make_test_dataloader(self): return self._make_tolstoi_dataloader(filepath) def _make_train_and_valid_dataloader(self): - # TODO check whether this is intended usage - """return self._make_train_and_valid_dataloader_helper( - self._make_train_dataloader(), - self._make_train_dataloader(), - )""" + # TODO validation data set return self._make_train_dataloader(), self._make_train_dataloader() diff --git a/deepobs/pytorch/testproblems/tolstoi_char_rnn.py b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py index a6f90967..93c6ddb1 100644 --- a/deepobs/pytorch/testproblems/tolstoi_char_rnn.py +++ b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py @@ -8,9 +8,37 @@ class tolstoi_char_rnn(WeightRegularizedTestproblem): - """DeepOBS test problem class for char_rnn network on Tolstoi. + """DeepOBS test problem class for a two-layer LSTM for character-level language + modelling (Char RNN) on Tolstoi's War and Peace. - TODO: add some more details how the test problem works + Some network characteristics: + + - ``128`` hidden units per LSTM cell + - sequence length ``50`` + - cell state is automatically stored in variables between subsequent steps + - when the phase placeholder switches its value from one step to the next, + the cell state is set to its zero value (meaning that we set to zero state + after each round of evaluation, it is therefore important to set the + evaluation interval such that we evaluate after a full epoch.) + + Working training parameters are: + + - batch size ``50`` + - ``200`` epochs + - SGD with a learning rate of :math:`\\approx 0.1` works + + Args: + batch_size (int): Batch size to use. + l2_reg (float): L2-regularization factor. L2-Regularization (weight decay) + is used on the weights but not the biases. + Defaults to ``5e-4``. + + Attributes: + _batch_size: Batch_size for the data of this test problem. + _l2_reg: The regularization factor for this test problem + data: The dataset used by the test problem (datasets.DataSet instance). + loss_function: The loss function for this test problem. + net: The torch module (the neural network) that is trained. 
""" # TODO check differences compared to tensorflow From b71d6730c612bd3f7a37287cc73c44d10bd3eda2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Thu, 7 Oct 2021 11:25:07 +0200 Subject: [PATCH 14/17] [FIX] denominator for 2d labels (like in Tolstoi) --- deepobs/pytorch/testproblems/testproblem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepobs/pytorch/testproblems/testproblem.py b/deepobs/pytorch/testproblems/testproblem.py index b661a50e..d6d689b2 100644 --- a/deepobs/pytorch/testproblems/testproblem.py +++ b/deepobs/pytorch/testproblems/testproblem.py @@ -143,7 +143,7 @@ def forward_func(): loss = self.loss_function(reduction=reduction)(outputs, labels) _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) + total += labels.numel() correct += (predicted == labels).sum().item() accuracy = correct / total From 3edd50527ca2b1b6843e9312a1ebc538b7381b73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Thu, 7 Oct 2021 11:36:11 +0200 Subject: [PATCH 15/17] [REF] cleanup --- deepobs/pytorch/testproblems/testproblems_modules.py | 5 +++-- deepobs/pytorch/testproblems/tolstoi_char_rnn.py | 9 --------- tests/test_testproblems.py | 2 +- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/deepobs/pytorch/testproblems/testproblems_modules.py b/deepobs/pytorch/testproblems/testproblems_modules.py index dc125157..877a0f8a 100644 --- a/deepobs/pytorch/testproblems/testproblems_modules.py +++ b/deepobs/pytorch/testproblems/testproblems_modules.py @@ -718,9 +718,10 @@ def __init__(self, seq_len, hidden_dim, vocab_size, num_layers): input_size=hidden_dim, hidden_size=hidden_dim, num_layers=num_layers, - dropout=0.36, # tensorflow two dropouts with keep=0.8 each -> dropout=1-0.8*0.8=0.36 + dropout=0.36, # tensorflow two dropouts with keep=0.8 each -> dropout=1-0.8*0.8=0.36 batch_first=True, ) + # deactivate redundant bias self.lstm.bias_ih_l0.data = torch.zeros_like(self.lstm.bias_ih_l0, device=self.lstm.bias_ih_l0.device) self.lstm.bias_ih_l1.data = torch.zeros_like(self.lstm.bias_ih_l1, device=self.lstm.bias_ih_l0.device) self.lstm.bias_ih_l0.requires_grad = False @@ -731,8 +732,8 @@ def __init__(self, seq_len, hidden_dim, vocab_size, num_layers): def forward(self, x, state=None): """state is a tuple for hidden and cell state for initialisation of the lstm""" x = self.embedding(x) - # if no state is provided, default the state to zeros x = self.dropout(x) + # if no state is provided, default the state to zeros if state is None: x, new_state = self.lstm(x) else: diff --git a/deepobs/pytorch/testproblems/tolstoi_char_rnn.py b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py index 93c6ddb1..7375d293 100644 --- a/deepobs/pytorch/testproblems/tolstoi_char_rnn.py +++ b/deepobs/pytorch/testproblems/tolstoi_char_rnn.py @@ -34,19 +34,11 @@ class tolstoi_char_rnn(WeightRegularizedTestproblem): Defaults to ``5e-4``. Attributes: - _batch_size: Batch_size for the data of this test problem. - _l2_reg: The regularization factor for this test problem data: The dataset used by the test problem (datasets.DataSet instance). loss_function: The loss function for this test problem. net: The torch module (the neural network) that is trained. 
""" - # TODO check differences compared to tensorflow - # - often the test on cuda fails: acc is greater than 1.0 - # - loss function: - # - tensorflow: mean across time, sum across batch - # - pytorch: mean across all - def __init__(self, batch_size, l2_reg=0.0005): """Create a new char_rnn test problem instance on Tolstoi. @@ -56,7 +48,6 @@ def __init__(self, batch_size, l2_reg=0.0005): is used on the weights but not the biases. Defaults to ``5e-4``. """ - print(f"batch_size={batch_size}") super(tolstoi_char_rnn, self).__init__(batch_size, l2_reg) def set_up(self): diff --git a/tests/test_testproblems.py b/tests/test_testproblems.py index a12bb353..1fac964f 100644 --- a/tests/test_testproblems.py +++ b/tests/test_testproblems.py @@ -150,7 +150,7 @@ def _check_parameters(tproblem, framework): for name, parameter in tproblem.net.named_parameters(): if parameter.requires_grad is False: continue - elif "weight_hh_l" in name: + elif "weight_hh_l" in name: # LSTM parameters counted separately in PyTorch num_param[-1] += parameter.numel() else: num_param.append(parameter.numel()) From 7be0020ad9342b78f25172f5684bf8fdb11e3c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Thu, 7 Oct 2021 11:44:46 +0200 Subject: [PATCH 16/17] [DEL] remove default sgd --- examples/runner_sgd_pytorch.py | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 examples/runner_sgd_pytorch.py diff --git a/examples/runner_sgd_pytorch.py b/examples/runner_sgd_pytorch.py deleted file mode 100644 index 38fee120..00000000 --- a/examples/runner_sgd_pytorch.py +++ /dev/null @@ -1,11 +0,0 @@ -"""StandardRunner: Default SGD.""" - -from torch.optim import SGD - -from deepobs import pytorch as pt - -optimizer_class = SGD -hyperparams = {"lr": {"type": float}} - -runner = pt.runners.StandardRunner(optimizer_class, hyperparams) -runner.run() From 883202f099f9dd29383e1e1e3e8c0e78d78d8db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Sch=C3=A4fer?= Date: Thu, 7 Oct 2021 14:52:41 +0200 Subject: [PATCH 17/17] [REF] separate training and validation data --- deepobs/pytorch/datasets/tolstoi.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/deepobs/pytorch/datasets/tolstoi.py b/deepobs/pytorch/datasets/tolstoi.py index ba973cd1..ffcb23d7 100644 --- a/deepobs/pytorch/datasets/tolstoi.py +++ b/deepobs/pytorch/datasets/tolstoi.py @@ -43,10 +43,9 @@ def __init__(self, batch_size, seq_length=50, train_eval_size=653237): self._train_eval_size = train_eval_size super(tolstoi, self).__init__(batch_size) - def _make_tolstoi_dataloader(self, filepath): - # Load the array of character ids, determine the number of batches that - # can be produced, given batch size and sequence lengh - arr = np.load(filepath) + def _make_tolstoi_dataloader(self, arr): + # determine the number of batches that can be produced, given batch size + # and sequence lengh num_batches = int( np.floor((np.size(arr) - 1) / (self._batch_size * self._seq_length)) ) @@ -80,7 +79,7 @@ def _make_tolstoi_dataloader(self, filepath): def _make_train_dataloader(self): filepath = os.path.join(get_data_dir(), "tolstoi", "train.npy") - return self._make_tolstoi_dataloader(filepath) + return self._make_tolstoi_dataloader(np.load(filepath)) def _make_train_eval_dataloader(self): indices = np.arange( @@ -91,8 +90,11 @@ def _make_train_eval_dataloader(self): def _make_test_dataloader(self): filepath = os.path.join(get_data_dir(), "tolstoi", "test.npy") - return self._make_tolstoi_dataloader(filepath) + return 
self._make_tolstoi_dataloader(np.load(filepath)) def _make_train_and_valid_dataloader(self): - # TODO validation data set - return self._make_train_dataloader(), self._make_train_dataloader() + filepath = os.path.join(get_data_dir(), "tolstoi", "train.npy") + data = np.load(filepath) + valid_data = data[0: self._train_eval_size] + train_data = data[self._train_eval_size:] + return self._make_tolstoi_dataloader(valid_data), self._make_tolstoi_dataloader(train_data)
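
Notes on the series (illustrative sketches in plain Python, not taken from the patches themselves):

PyTorch's nn.LSTM carries two bias vectors per layer (bias_ih_l{i} and bias_hh_l{i}), while the TensorFlow reference network has a single bias per layer. Patches 06/10 zero out and freeze the redundant bias_ih_l{i}, and patches 07/15 merge weight_ih_l{i} and weight_hh_l{i} when counting parameters so the count matches the TensorFlow layout. A minimal sketch of both ideas, assuming a plain two-layer nn.LSTM like the one in net_char_rnn:

    import torch
    from torch import nn

    lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=2, batch_first=True)

    # deactivate the redundant input-hidden bias of every layer
    for layer in range(lstm.num_layers):
        bias_ih = getattr(lstm, f"bias_ih_l{layer}")
        bias_ih.data.zero_()           # bias_hh_l{layer} already provides a bias
        bias_ih.requires_grad = False  # keep it out of training and out of the count

    # count parameters the way the adjusted _check_parameters does
    num_param = []
    for name, parameter in lstm.named_parameters():
        if not parameter.requires_grad:
            continue
        if "weight_hh_l" in name:
            num_param[-1] += parameter.numel()  # merge with weight_ih_l{layer}
        else:
            num_param.append(parameter.numel())
    print(num_param)  # one weight entry and one bias entry per LSTM layer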
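The dropout value in patch 11 comes from folding the two TensorFlow dropout layers (keep probability 0.8 before and after each LSTM layer) into the single dropout slot that nn.LSTM offers between layers: a unit survives both drops with probability 0.8 * 0.8 = 0.64, so the equivalent single dropout probability is 1 - 0.64 = 0.36. The arithmetic, spelled out:

    keep = 0.8
    combined_keep = keep * keep         # 0.64: survive both dropout layers
    equivalent_p = 1.0 - combined_keep  # 0.36, the value passed to nn.LSTM(dropout=...)
    assert abs(equivalent_p - 0.36) < 1e-12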
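Patch 14 fixes the accuracy denominator for sequence labels. For the Char RNN the network output is transposed to (batch_size, vocab_size, seq_len), so torch.max(outputs.data, 1) yields one predicted character per time step, and the labels have shape (batch_size, seq_len). Dividing the number of correct predictions by labels.size(0) (the batch size) instead of labels.numel() (batch_size * seq_len) is what let the evaluation report accuracies greater than 1.0. A small sketch with assumed shapes:

    import torch

    batch_size, vocab_size, seq_len = 4, 83, 50
    outputs = torch.randn(batch_size, vocab_size, seq_len)     # like net_char_rnn's output
    labels = torch.randint(vocab_size, (batch_size, seq_len))  # one target character per step

    _, predicted = torch.max(outputs.data, 1)    # argmax over the vocab dimension
    correct = (predicted == labels).sum().item()
    total = labels.numel()                       # batch_size * seq_len, not labels.size(0)
    accuracy = correct / total                   # now always within [0, 1]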
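The Tolstoi loader (patches 03 and 17) works on a flat array of character ids: it determines how many batch_size x seq_length blocks fit, forms input sequences, and uses the same text shifted by one character as the targets; patch 17 additionally carves the first train_eval_size ids out of train.npy as the validation split. The reshaping below is a self-contained sketch of that batching idea under those assumptions, not a copy of the DeepOBS implementation:

    import numpy as np

    def make_char_batches(arr: np.ndarray, batch_size: int, seq_length: int):
        """Split a flat array of character ids into inputs x and targets y (y shifted by one)."""
        num_batches = (arr.size - 1) // (batch_size * seq_length)
        used = num_batches * batch_size * seq_length
        x = arr[:used].reshape(batch_size, num_batches, seq_length)
        y = arr[1:used + 1].reshape(batch_size, num_batches, seq_length)
        # reorder to one (batch_size, seq_length) block per training step
        return x.transpose(1, 0, 2), y.transpose(1, 0, 2)

    ids = np.arange(100_000)  # stand-in for np.load(".../tolstoi/train.npy")
    x, y = make_char_batches(ids, batch_size=50, seq_length=50)
    assert x.shape == y.shape == (39, 50, 50)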