diff --git a/model.py b/model.py
index 52566ef..7a3a5ec 100644
--- a/model.py
+++ b/model.py
@@ -40,7 +40,7 @@ def forward(self, x, length=None, writer=None):
             _, _, look_ahead_mask = utils.get_masked_with_pad_tensor(self.max_seq, x, x, config.pad_token)
             decoder, w = self.Decoder(x, mask=look_ahead_mask)
             fc = self.fc(decoder)
-            return fc.contiguous() if self.training else fc.contiguous(), [weight.contiguous() for weight in w]
+            return fc.contiguous() if self.training else (fc.contiguous(), [weight.contiguous() for weight in w])
         else:
             return self.generate(x, length, None).contiguous().tolist()
diff --git a/train.py b/train.py
index 6d7b618..1c97d37 100644
--- a/train.py
+++ b/train.py
@@ -135,7 +135,8 @@
         # switch output device to: gpu-1 ~ gpu-n
         sw_start = time.time()
-        mt.output_device = idx % (torch.cuda.device_count() -1) + 1
+        if torch.cuda.device_count() > 1:
+            mt.output_device = idx % (torch.cuda.device_count() -1) + 1
         sw_end = time.time()
         if config.debug:
             print('output switch time: {}'.format(sw_end - sw_start) )
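
Note (illustrative, not part of the patch): the model.py change fixes an operator-precedence bug. In Python the trailing comma binds looser than the conditional expression, so the old return statement always built a 2-tuple, even in training mode. A minimal sketch with placeholder values in place of fc and w:

    # old form: parsed as ((fc if training else fc), weights) -- a tuple either way
    fc, weights = 1, [2]
    training = True
    old = fc if training else fc, weights          # -> (1, [2]) regardless of `training`
    new = fc if training else (fc, weights)        # -> 1 when training, (1, [2]) otherwise
    assert old == (1, [2]) and new == 1

The train.py guard prevents `idx % (torch.cuda.device_count() - 1)` from dividing by zero on single-GPU machines, where `device_count()` is 1.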