Add on_time_limit email notification #71

Open · wants to merge 1 commit into base: master
README.md: 2 additions & 2 deletions
@@ -94,8 +94,8 @@ cluster = SlurmCluster(
python_cmd='python3'
)

-# let the cluster know where to email for a change in job status (ie: complete, fail, etc...)
-cluster.notify_job_status(email='[email protected]', on_done=True, on_fail=True)
+# let the cluster know where to email for a change in job status (ie: complete, fail, time limit reached, etc...)
+cluster.notify_job_status(email='[email protected]', on_done=True, on_fail=True, on_time_limit=True)

# set the job options. In this instance, we'll run 20 different models
# each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs)
docs/hpc/SlurmCluster.md: 6 additions & 5 deletions
@@ -25,8 +25,8 @@ cluster = SlurmCluster(
python_cmd='python3'
)

-# let the cluster know where to email for a change in job status (ie: complete, fail, etc...)
-cluster.notify_job_status(email='[email protected]', on_done=True, on_fail=True)
+# let the cluster know where to email for a change in job status (ie: complete, fail, time limit reached, etc...)
+cluster.notify_job_status(email='[email protected]', on_done=True, on_fail=True, on_time_limit=True)

# set the job options. In this instance, we'll run 20 different models
# each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs)
@@ -254,20 +254,21 @@ cluster.load_modules([
### `notify_job_status`

``` {.python}
-cluster.notify_job_status(email, on_done, on_fail)
+cluster.notify_job_status(email, on_done, on_fail, on_time_limit)
```

Sets the email address Slurm should notify when the job's status changes, and which status changes trigger an email.

- ```email``` String. Email address to get notifications.
- ```on_done``` Boolean. If true, you'll get an email when the job completes.
-- ```on_fail``` Boolean. If true, you'll get an email if the job fails.
+- ```on_fail``` Boolean. If true, you'll get an email if the job fails.
+- ```on_time_limit``` Boolean. If true, you'll get an email if the job is stopped for exceeding its time limit.

**Example**


``` {.python}
-cluster.notify_job_status(email='[email protected]', on_done=True, on_fail=True)
+cluster.notify_job_status(email='[email protected]', on_done=True, on_fail=True, on_time_limit=True)
```

### `optimize_parallel_cluster_gpu`
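A note on the flags documented in the hunk above: SLURM's own email notifications are driven by the `--mail-type` and `--mail-user` batch options, and the `notify_job_status` flags are expected to map onto those values. A minimal usage sketch with a hypothetical address (`you@example.com`); the exact directives test-tube writes into the job script are an assumption here:

``` {.python}
# Request emails on completion, failure, and time-limit termination.
# 'you@example.com' is a placeholder address for illustration.
cluster.notify_job_status(email='you@example.com', on_done=True,
                          on_fail=True, on_time_limit=True)

# Expected effect on the generated SLURM script (assumed, standard SBATCH options):
#   #SBATCH --mail-type=END,FAIL,TIME_LIMIT
#   #SBATCH --mail-user=you@example.com
```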
examples/hpc_cpu_example.py: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def train(hparams, *args):

# Email results if your hpc supports it.
cluster.notify_job_status(
-email='[email protected]', on_done=True, on_fail=True)
+email='[email protected]', on_done=True, on_fail=True, on_time_limit=True)

# SLURM Module to load.
cluster.load_modules([
examples/pytorch_hpc_example.py: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ def train(hparams, *args):

# Email results if your hpc supports it.
cluster.notify_job_status(
-email='[email protected]', on_done=True, on_fail=True)
+email='[email protected]', on_done=True, on_fail=True, on_time_limit=True)

# SLURM Module to load.
cluster.load_modules([
test_tube/hpc.py: 5 additions & 1 deletion
@@ -44,6 +44,7 @@ def __init__(
self.email = None
self.notify_on_end = False
self.notify_on_fail = False
+self.notify_on_time_limit = False
self.job_name = None
self.python_cmd = python_cmd
self.gpu_type = None
@@ -96,10 +96,11 @@ def add_command(self, cmd):
def load_modules(self, modules):
self.modules = modules

-def notify_job_status(self, email, on_done, on_fail):
+def notify_job_status(self, email, on_done, on_fail, on_time_limit):
self.email = email
self.notify_on_end = on_done
self.notify_on_fail = on_fail
+self.notify_on_time_limit = on_time_limit

def optimize_parallel_cluster(self, train_function, nb_trials, job_name):
raise NotImplementedError
@@ -460,6 +462,8 @@ def __build_slurm_command(self, trial, slurm_cmd_script_path, timestamp, exp_i,
mail_type.append('END')
if self.notify_on_fail:
mail_type.append('FAIL')
+if self.notify_on_time_limit:
+mail_type.append('TIME_LIMIT')
if len(mail_type) > 0:
mail_type_query = [
'# Have SLURM send you an email when the job ends or fails',
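The hunk above ends just as the collected `mail_type` values are turned into script directives. A self-contained sketch of that mapping, assuming test-tube emits the standard `#SBATCH --mail-type`/`--mail-user` lines; the function name below is illustrative, not the library's verbatim code:

``` {.python}
# Illustrative sketch only -- not the verbatim test_tube/hpc.py code.
# Maps the notify_job_status flags to SLURM mail directives.
def build_mail_directives(email, on_done, on_fail, on_time_limit):
    mail_type = []
    if on_done:
        mail_type.append('END')
    if on_fail:
        mail_type.append('FAIL')
    if on_time_limit:
        mail_type.append('TIME_LIMIT')

    directives = []
    if len(mail_type) > 0:
        directives.append('# Have SLURM send you an email when the job ends or fails')
        directives.append('#SBATCH --mail-type={}'.format(','.join(mail_type)))
        directives.append('#SBATCH --mail-user={}'.format(email))
    return directives


# Example with all three flags enabled ('you@example.com' is a placeholder):
print('\n'.join(build_mail_directives('you@example.com', True, True, True)))
# Prints:
#   # Have SLURM send you an email when the job ends or fails
#   #SBATCH --mail-type=END,FAIL,TIME_LIMIT
#   #SBATCH --mail-user=you@example.com
```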