From 22e72c55eb967dd21daaf2974f7b741e67c8d24b Mon Sep 17 00:00:00 2001
From: Lood
Date: Tue, 15 Mar 2022 16:56:57 -0400
Subject: [PATCH 1/3] helper.py: added save_weights and calc_weights

Removed default theta (make sure to provide theta when calculating weights).
Added extra hardcoded datasets.
Load the dataset from alignments_dir instead, if specified.
---
 DeepSequence/helper.py | 279 +++++++++++++++++++++++++++++++++++------
 1 file changed, 241 insertions(+), 38 deletions(-)

diff --git a/DeepSequence/helper.py b/DeepSequence/helper.py
index d82935e..80abb7e 100644
--- a/DeepSequence/helper.py
+++ b/DeepSequence/helper.py
@@ -1,4 +1,8 @@
 from __future__ import print_function
+from collections import defaultdict
+import cPickle
+import os
+
 import numpy as np
 import theano
 import theano.tensor as T
@@ -9,14 +13,18 @@ class DataHelper:
     def __init__(self,
-                 dataset="",
-                 alignment_file="",
-                 focus_seq_name="",
-                 calc_weights=True,
-                 working_dir=".",
-                 theta=0.2,
-                 load_all_sequences=True,
-                 alphabet_type="protein"):
+                 dataset="",
+                 alignment_file="",
+                 focus_seq_name="",
+                 calc_weights=True,
+                 working_dir=".",
+                 theta=None,
+                 load_all_sequences=True,
+                 alphabet_type="protein",
+                 weights_dir="",
+                 save_weights=False,
+                 alignments_dir=None,
+                 ):
         """
         Class to load and organize alignment data.
@@ -42,6 +50,8 @@ def __init__(self,
         load_all_sequences:
         alphabet_type: Alphabet type of associated dataset. Options are DNA, RNA, protein, allelic
+        weights_dir: location of the weights, assumed to be in the form <dataset>_<suffix>.npy,
+            where <suffix> could be e.g. theta_0.x

         Returns
         ------------
         None
@@ -50,19 +60,24 @@ def __init__(self,
         np.random.seed(42)
         self.dataset = dataset
+        self.dataset = self.dataset.split(".a2m")[0]  # Remove the .a2m extension (a name without the extension passes through unchanged)
         self.alignment_file = alignment_file
         self.focus_seq_name = focus_seq_name
         self.working_dir = working_dir
         self.calc_weights = calc_weights
         self.alphabet_type = alphabet_type
+        self.weights_dir = weights_dir
+        self.save_weights = save_weights
+        self.alignments_dir = alignments_dir

         # Initialize the elbo of the wt to None
         # will be useful if eventually doing mutation effect prediction
         self.wt_elbo = None

         # Alignment processing parameters
-        self.theta = theta
-
+        # Note: the script will fail if calc_weights is True and theta is not set here or by configure_datasets()
+        if theta is not None:
+            self.theta = theta
         # If I am running tests with the model, I don't need all the
         # sequences loaded
         self.load_all_sequences = load_all_sequences
@@ -75,9 +90,15 @@ def __init__(self,
         if self.alphabet_type == "protein":
             self.alphabet = "ACDEFGHIKLMNPQRSTVWY"
             self.reorder_alphabet = "DEKRHNQSTPGAVILMCFYW"
+        elif self.alphabet_type == "protein_withgap":
+            self.alphabet = "ACDEFGHIKLMNPQRSTVWY-"
+            self.reorder_alphabet = "DEKRHNQSTPGAVILMCFYW-"
         elif self.alphabet_type == "RNA":
             self.alphabet = "ACGU"
             self.reorder_alphabet = "ACGU"
+        elif self.alphabet_type == "RNA_withgap":
+            self.alphabet = "ACGU-"
+            self.reorder_alphabet = "ACGU-"
         elif self.alphabet_type == "DNA":
             self.alphabet = "ACGT"
             self.reorder_alphabet = "ACGT"
@@ -85,7 +106,7 @@ def __init__(self,
             self.alphabet = "012"
             self.reorder_alphabet = "012"

-        #then generate the experimental data
+        # then generate the experimental data
         self.gen_basic_alignment()

         if self.load_all_sequences:
@@ -94,22 +115,159 @@ def __init__(self,

     def configure_datasets(self):

         if self.dataset == "BLAT_ECOLX":
-            self.alignment_file = self.working_dir+"/datasets/BLAT_ECOLX_hmmerbit_plmc_n5_m30_f50_t0.2_r24-286_id100_b105.a2m"
+            self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_hmmerbit_plmc_n5_m30_f50_t0.2_r24-286_id100_b105.a2m"
             self.theta = 0.2

+        elif self.dataset == "BLAT_ECOLX_withgaps":
+            self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_1_b0.5.a2m"
+            self.alphabet_type = "protein_withgap"
+            self.theta = 0.2
+
+        elif self.dataset == "PTEN_HUMAN_withgaps":
+            self.alignment_file = self.working_dir + "/datasets/alignments/PTEN_HUMAN_1_b0.3.a2m"
+            self.alphabet_type = "protein_withgap"
+            self.theta = 0.2
+
+        elif self.dataset == "HIS7_YEAST_withgaps":
+            self.alignment_file = self.working_dir + "/datasets/alignments/HIS7_YEAST_1_b0.5.a2m"
+            self.alphabet_type = "protein_withgap"
+            self.theta = 0.2
+
+        elif self.dataset == "P53_HUMAN_withgaps":
+            self.alignment_file = self.working_dir + "/datasets/alignments/P53_HUMAN_r90-300_uniref100_Nov17_b0.06.a2m"
+            self.alphabet_type = "protein_withgap"
+            self.theta = 0.2
+
+        elif self.dataset == "SNORNA_YEAST_withgaps":
+            self.alignment_file = self.working_dir + "/datasets/alignments/CL00100_cmRF00012_m70_f50.a2m"
+            self.alphabet_type = "RNA_withgap"
+            self.theta = 0.2
+
+        elif self.dataset == 'naive_repertoire_fullseqs_withgaps':
+            self.alignment_file = self.working_dir + '/datasets/alignments/naive_repertoire_annotated_aligned_fullseqs.fa'
+            self.alphabet_type = "protein_withgap"
+            self.theta = 0.2
+
         elif self.dataset == "PABP_YEAST":
-            self.alignment_file = self.working_dir+"/datasets/PABP_YEAST_hmmerbit_plmc_n5_m30_f50_t0.2_r115-210_id100_b48.a2m"
+            self.alignment_file = self.working_dir + "/datasets/PABP_YEAST_hmmerbit_plmc_n5_m30_f50_t0.2_r115-210_id100_b48.a2m"
             self.theta = 0.2

         elif self.dataset == "DLG4_RAT":
-            self.alignment_file = self.working_dir+"/datasets/DLG4_RAT_hmmerbit_plmc_n5_m30_f50_t0.2_r300-400_id100_b50.a2m"
+            self.alignment_file = self.working_dir + "/datasets/DLG4_RAT_hmmerbit_plmc_n5_m30_f50_t0.2_r300-400_id100_b50.a2m"
             self.theta = 0.2

         elif self.dataset == "trna":
-            self.alignment_file = self.working_dir+"/datasets/RF00005_CCU.fasta"
+            self.alignment_file = self.working_dir + "/datasets/RF00005_CCU.fasta"
             self.alphabet_type = "RNA"
             self.theta = 0.2

+        elif self.dataset == "PA_FLU":
+            self.alignment_file = self.working_dir + "/datasets/alignments/PA_FLU_1_b0.5.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "PA_FLU_orig":
+            self.alignment_file = self.working_dir + "/datasets/alignments/PA_FLU_1_b0.5.a2m"
+            self.theta = 0.01
+
+        elif self.dataset == "PA_FLU_jonny":
+            self.alignment_file = self.working_dir + "/datasets/alignments/PA_FLU_jonny_1_b0.5.a2m"
+            self.theta = 0.2
+            self.calc_weights = False
+
+        elif self.dataset == "DPO1_KLULA":
+            self.alignment_file = self.working_dir + "/datasets/alignments/DPO1_KLULA_b0.1.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "DRTS_PLAFK":
+            self.alignment_file = self.working_dir + "/datasets/alignments/DRTS_PLAFK_r1-280_b0.1.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "HIS4_THEMA":
+            self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_THEMA_b0.2.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "HIS4_YEAST":
+            self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_YEAST_b0.2.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "HIS4_YEAST_b0.1":
+            self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_YEAST_b0.1.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "HIS4_THEMA_b0.1":
+            self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_THEMA_b0.1.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "TRP_YEAST":
+            self.alignment_file = self.working_dir + "/datasets/alignments/TRP_YEAST_b0.1.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "TRPB1_THEMA":
+            self.alignment_file = self.working_dir + "/datasets/alignments/TRPB1_THEMA_b0.4.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "TRPB2_THEMA":
+            self.alignment_file = self.working_dir + "/datasets/alignments/TRPB2_THEMA_b0.4.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "TRPF_YEAST":
+            self.alignment_file = self.working_dir + "/datasets/alignments/TRPF_YEAST_b0.2.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "TRPF_YEAST_b0.1":
+            self.alignment_file = self.working_dir + "/datasets/alignments/TRPF_YEAST_b0.1.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "BLAT_ECOLX_1_seqid0.3":
+            self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_1_b0.5_seqid0.3.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "BLAT_ECOLX_1":
+            self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_1_b0.5.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "AMIE_PSEAE_1_seqid0.3":
+            self.alignment_file = self.working_dir + "/datasets/alignments/AMIE_PSEAE_1_b0.3_seqid0.3.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "AMIE_PSEAE_1":
+            self.alignment_file = self.working_dir + "/datasets/alignments/AMIE_PSEAE_1_b0.3.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == "P53_HUMAN":
+            self.alignment_file = self.working_dir + "/datasets/alignments/P53_HUMAN_r90-300_uniref100_Nov17_b0.06.a2m"
+            self.theta = 0.2
+
+        elif self.dataset == 'BF520_env':
+            self.alignment_file = self.working_dir + '/datasets/alignments/BF520_env_1_b0.5.a2m'
+            self.theta = 0.01
+
+        elif self.dataset == 'BG505_env':
+            self.alignment_file = self.working_dir + '/datasets/alignments/BG505_env_1_b0.5.a2m'
+            self.theta = 0.01
+
+        elif self.dataset == 'deltaNTRP_YEAST':
+            self.alignment_file = self.working_dir + '/datasets/alignments/deltaNTRP_YEAST_b0.4.a2m'
+            self.theta = 0.2
+
+        elif self.dataset == 'PfDHFR':
+            self.alignment_file = self.working_dir + '/datasets/alignments/PfDHFR_b0.1.a2m'
+            self.theta = 0.2
+
+        elif self.dataset == 'TP-DNAP1':
+            self.alignment_file = self.working_dir + '/datasets/alignments/TP-DNAP1_b0.1.a2m'
+            self.theta = 0.2
+
+        elif self.dataset == 'BRCA1_HUMAN':
+            self.alignment_file = self.working_dir + '/datasets/alignments/BRCA1_HUMAN_1_b0.5.a2m'
+            self.theta = 0.2
+
+        else:
+            if self.alignments_dir is not None:
+                self.alignment_file = os.path.join(self.alignments_dir, self.dataset + ".a2m")
+            else:
+                self.alignment_file = self.working_dir + '/datasets/alignments/' + self.dataset + '.a2m'
+        assert os.path.isfile(self.alignment_file), "Alignment file not found: " + self.alignment_file
@@ -117,18 +275,18 @@ def configure_datasets(self):
     def one_hot_3D(self, s):
         """ Transform sequence string into one-hot aa vector"""
         x = np.zeros((len(s), len(self.alphabet)))
         for i, letter in enumerate(s):
             if letter in self.aa_dict:
-                x[i , self.aa_dict[letter]] = 1
+                x[i, self.aa_dict[letter]] = 1
         return x

     def gen_basic_alignment(self):
         """ Read training alignment and store basics in class instance """
         # Make a dictionary that goes from aa to a number for one-hot
         self.aa_dict = {}
-        for i,aa in enumerate(self.alphabet):
+        for i, aa in enumerate(self.alphabet):
             self.aa_dict[aa] = i

         # Do the inverse as well
-        self.num_to_aa = {i:aa for aa,i in self.aa_dict.items()}
+        self.num_to_aa = {i: aa for aa, i in self.aa_dict.items()}

         ix = np.array([self.alphabet.find(s) for s in self.reorder_alphabet])

@@ -162,8 +320,12 @@ def gen_basic_alignment(self):
         # We also expect the focus sequence to be formatted as:
         # >[NAME]/[start]-[end]
-        focus_loc = self.focus_seq_name.split("/")[-1]
-        start,stop = focus_loc.split("-")
+        focus_loc = self.focus_seq_name
+        # The header can include extra information (e.g. a custom weight) after a ':' separator
+        if ':' in focus_loc:
+            focus_loc = focus_loc[:focus_loc.rfind(':')]
+        focus_loc = focus_loc.split("/")[-1]
+        start, stop = focus_loc.split("-")
         self.focus_start_loc = int(start)
         self.focus_stop_loc = int(stop)
         self.uniprot_focus_cols_list \
             = [idx_col+int(start) for idx_col in self.focus_cols]
@@ -173,21 +335,20 @@ def gen_basic_alignment(self):
         self.uniprot_focus_col_to_focus_idx \
             = {idx_col+int(start):idx_col for idx_col in self.focus_cols}

-
     def gen_full_alignment(self):

         # Get only the focus columns
-        for seq_name,sequence in self.seq_name_to_sequence.items():
+        for seq_name, sequence in self.seq_name_to_sequence.items():
             # Replace periods with dashes (the uppercase equivalent)
-            sequence = sequence.replace(".","-")
+            sequence = sequence.replace(".", "-")

-            #then get only the focus columns
+            # then get only the focus columns
             self.seq_name_to_sequence[seq_name] = [sequence[ix].upper() for ix in self.focus_cols]

         # Remove sequences that have bad characters
         alphabet_set = set(list(self.alphabet))
         seq_names_to_remove = []
-        for seq_name,sequence in self.seq_name_to_sequence.items():
+        for seq_name, sequence in self.seq_name_to_sequence.items():
             for letter in sequence:
                 if letter not in alphabet_set and letter != "-":
                     seq_names_to_remove.append(seq_name)

@@ -197,17 +358,16 @@ def gen_full_alignment(self):
         seq_names_to_remove = list(set(seq_names_to_remove))
         for seq_name in seq_names_to_remove:
             del self.seq_name_to_sequence[seq_name]

         # Encode the sequences
-        print ("Encoding sequences")
-        self.x_train = np.zeros((len(self.seq_name_to_sequence.keys()),len(self.focus_cols),len(self.alphabet)))
+        print("Encoding sequences")
+        self.x_train = np.zeros((len(self.seq_name_to_sequence.keys()), len(self.focus_cols), len(self.alphabet)))
         self.x_train_name_list = []
-        for i,seq_name in enumerate(self.seq_name_to_sequence.keys()):
+        for i, seq_name in enumerate(self.seq_name_to_sequence.keys()):
             sequence = self.seq_name_to_sequence[seq_name]
             self.x_train_name_list.append(seq_name)
-            for j,letter in enumerate(sequence):
+            for j, letter in enumerate(sequence):
                 if letter in self.aa_dict:
                     k = self.aa_dict[letter]
-                    self.x_train[i,j,k] = 1.0
-
+                    self.x_train[i, j, k] = 1.0

@@ -226,16 +386,58 @@ def gen_full_alignment(self):
         # Fast sequence weights with Theano
         if self.calc_weights:
...
             self.weights = weightfun(self.x_train, self.theta)[0]

+            if self.save_weights:
+                print("Saving sequence weights, dataset={} in dir {}".format(self.dataset, self.weights_dir))
+                if os.path.isdir(self.weights_dir):
+                    weights_dir_found = self.weights_dir
+                else:
+                    weights_dir_found = os.path.join(self.working_dir, self.weights_dir)
+                assert os.path.isdir(weights_dir_found), "Could not find weights directory: {} given, expanded to {} using working_dir".format(self.weights_dir, weights_dir_found)
+                # e.g. BLAT_ECOLX_theta_0.2.npy
+                filename_out = os.path.join(weights_dir_found, "{}_theta_{}.npy".format(self.dataset, self.theta))
+                print("Saving weights to {}".format(filename_out))
+                np.save(filename_out, self.weights)
+
         else:
-            # If not using weights, use an isotropic weight matrix
-            self.weights = np.ones(self.x_train.shape[0])
+            if ':' in self.focus_seq_name:
+                print("Loading detected sequence weights")
+                self.weights = np.zeros(self.x_train.shape[0])
+                for i, seq_name in enumerate(self.x_train_name_list):
+                    self.weights[i] = float(seq_name.split(':')[-1])
+            elif self.weights_dir != "":
+                print("Loading sequence weights from file, looking for {} in {}".format(self.dataset, self.weights_dir))
+
+                # Get the UniProt-style ID (the prefix up to the second underscore)
+                # TODO Note: this fails for some of the MSA/weight pairs in the original DeepSeq dataset,
+                # since they don't have unique UniProt ids
+                dataset_prefix = "_".join(self.dataset.split("_")[:2])
+                # Set path
+                if os.path.isdir(self.weights_dir):
+                    weights_dir_found = self.weights_dir
+                else:
+                    weights_dir_found = os.path.join(self.working_dir, self.weights_dir)
+                assert os.path.isdir(weights_dir_found), "Could not find weights directory: {} given, expanded to {}".format(
+                    self.weights_dir, weights_dir_found)
+
+                # Find the weights file using the dataset prefix
+                found = [file for file in os.listdir(weights_dir_found) if file.startswith(dataset_prefix) and file.endswith(".npy")]
+                assert len(found) == 1, \
+                    "Could not find unique weights file for dataset {} with prefix {}, in {}, found {} files" \
+                    .format(self.dataset, dataset_prefix, weights_dir_found, found)
+                weights_location = os.path.join(weights_dir_found, found[0])
+
+                self.weights = np.load(file=weights_location)
+                print("Weights loaded from {}".format(weights_location))
+            else:
+                # If not using weights, set a uniform weight of 1 for every sequence
+                print("Not using weights, using isotropic weight matrix")
+                self.weights = np.ones(self.x_train.shape[0])
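+
+        # For reference (editorial comment, not executed): self.weights ends up as a 1-D
+        # array with one weight per kept sequence. With calc_weights=True it is the usual
+        # MSA reweighting; a slow NumPy sketch of what the Theano code above computes:
+        #   x_flat = self.x_train.reshape(self.x_train.shape[0], -1)
+        #   overlap = np.dot(x_flat, x_flat.T)               # matching positions per pair
+        #   pid = overlap / np.diag(overlap)                 # pid[t, s]: identity of t relative to s
+        #   self.weights = 1.0 / np.sum(pid > 1 - self.theta, axis=0)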

         self.Neff = np.sum(self.weights)

         print ("Neff =",str(self.Neff))
         print ("Data Shape =",self.x_train.shape)

-
     def delta_elbo(self, model, mutant_tuple_list, N_pred_iterations=10):

         for pos,wt_aa,mut_aa in mutant_tuple_list:
@@ -248,7 +450,6 @@ def delta_elbo(self, model, mutant_tuple_list, N_pred_iterations=10):
         for pos,wt_aa,mut_aa in mutant_tuple_list:
             mut_seq[self.uniprot_focus_col_to_focus_idx[pos]] = mut_aa

-
         if self.wt_elbo == None:
             mutant_sequences = [self.focus_seq_trimmed, mut_seq]
         else:
@@ -266,7 +467,6 @@ def delta_elbo(self, model, mutant_tuple_list, N_pred_iterations=10):
         prediction_matrix = np.zeros((mutant_sequences_one_hot.shape[0],N_pred_iterations))
         idx_batch = np.arange(mutant_sequences_one_hot.shape[0])
         for i in range(N_pred_iterations):
-
             batch_preds, _, _ = model.all_likelihood_components(mutant_sequences_one_hot)

             prediction_matrix[:,i] = batch_preds
@@ -485,7 +685,6 @@ def get_pattern_activations(self, model, update_num, filename_prefix="",

         OUTPUT.close()

-
     def get_embeddings(self, model, update_num, filename_prefix="",
                        verbose=False, minibatch_size=2000):
         """ Save the latent variables from all the sequences in the alignment """
@@ -504,7 +703,6 @@ def get_embeddings(self, model, update_num, filename_prefix="",
         header_list = mu_header_list + log_sigma_header_list
         OUTPUT.write("update_num,name,"+",".join(header_list)+"\n")

-
         batch_order = np.arange(len(self.x_train_name_list))

         for i in range(0,len(self.x_train_name_list),minibatch_size):
@@ -540,6 +738,7 @@ def get_elbo_samples(self, model, N_pred_iterations=100,
             minibatch_size=2000):

         for k,idx_batch in enumerate(batch_index.tolist()):
             self.prediction_matrix[idx_batch][i]= batch_preds[k]

+
 def gen_job_string(data_params, model_params):
     """
     Generates a unique job string given data and model parameters.
@@ -574,6 +773,7 @@ def gen_job_string(data_params, model_params):
     encoder_architecture_str = "-".join([str(size) for size in encoder_architecture])
     decoder_architecture_str = "-".join([str(size) for size in decoder_architecture])

+    # Note: if job_str gets too long it will cause an error when saving files named after it
     job_str = "vae_output_encoder-"+encoder_architecture_str+"_Nlatent-"+str(n_latent)\
         +"_decoder-"+decoder_architecture_str

@@ -582,6 +782,9 @@ def gen_job_string(data_params, model_params):
         if data_id not in written_out_vals:
             if str(type(data_val)) == "<type 'list'>":
                 job_id_list.append(data_id+"-"+"-".join([str(val) for val in data_val]))
+            # LvN: Skip values that contain a '/' because it causes errors when job_id is used as a filename
+            elif isinstance(data_val, str) and "/" in data_val:
+                pass
             else:
                 job_id_list.append(data_id+"-"+str(data_val))
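Taken together, the helper.py changes above are typically exercised as in the sketch below. This is illustrative only: the dataset name, theta value and weights directory are placeholder choices, and helper is imported the same way the calc_weights.py script in the next patch does:

    import helper

    # First run: compute weights at theta=0.2 and save them as weights/BLAT_ECOLX_theta_0.2.npy
    data_helper = helper.DataHelper(dataset="BLAT_ECOLX", working_dir=".", theta=0.2,
                                    calc_weights=True, save_weights=True, weights_dir="weights")

    # Later runs: skip the slow weight computation and reuse the saved file,
    # which is located in weights_dir via the dataset's UniProt-style prefix
    data_helper = helper.DataHelper(dataset="BLAT_ECOLX", working_dir=".",
                                    calc_weights=False, weights_dir="weights")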
From c0d3bad1ff205f25fa9950d8fd397137f7ba22ff Mon Sep 17 00:00:00 2001
From: Lood
Date: Tue, 15 Mar 2022 17:02:27 -0400
Subject: [PATCH 2/3] added calc_weights.py

---
 calc_weights.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 calc_weights.py

diff --git a/calc_weights.py b/calc_weights.py
new file mode 100644
index 0000000..b43a498
--- /dev/null
+++ b/calc_weights.py
@@ -0,0 +1,45 @@
+# Basically a copy of the first part of run_svi.py, but without loading the model after calculating the weights
+import time
+import sys
+
+sys.path.insert(0, "../DeepSequence/")
+import numpy as np
+
+import helper
+import argparse
+
+parser = argparse.ArgumentParser(description="Calculate the sequence weights and store them in weights_dir.")
+parser.add_argument("--dataset", type=str, default="BLAT_ECOLX",
+                    help="Dataset name for fitting model.")
+parser.add_argument("--theta-override", type=float, default=None,
+                    help="Override the model theta.")
+parser.add_argument("--alignments_dir", type=str, help="Overrides the default ./datasets/alignments/")
+# Keeping this name different from weights_dir just so that we don't make mistakes and overwrite weights
+parser.add_argument("--weights_dir_out", type=str, default="", help="Location to store weights.")
+args = parser.parse_args()
+
+# DataHelper expects the dataset name without the extension
+args.dataset = args.dataset.split(".a2m")[0]
+assert not args.dataset.endswith(".a2m")
+
+data_params = {
+    "dataset": args.dataset,
+    "weights_dir": args.weights_dir_out,
+}
+
+if __name__ == "__main__":
+    start_time = time.time()
+
+    data_helper = helper.DataHelper(dataset=data_params["dataset"],
+                                    working_dir='.',
+                                    theta=args.theta_override,
+                                    weights_dir=data_params["weights_dir"],
+                                    calc_weights=True,
+                                    alignments_dir=args.alignments_dir,
+                                    save_weights=True,
+                                    )
+    # Record which theta was actually used (it may come from configure_datasets rather than the override)
+    data_params['theta'] = data_helper.theta
+
+    end_time = time.time()
+    print("Done in " + str(end_time - start_time) + " seconds")
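For a one-off MSA the script can be run directly, e.g. python calc_weights.py --dataset BLAT_ECOLX --weights_dir_out weights/ (directory name illustrative). If --theta-override is omitted, the theta hardcoded for that dataset in helper.py is used; for datasets without a hardcoded entry, an override must be supplied or the weight calculation will fail, as noted in PATCH 1. The SLURM script in the next patch fans the same command out over a whole directory of alignments.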
From 04d97a593d5a21e96eb83a1df951f88f66c42f71 Mon Sep 17 00:00:00 2001
From: Lood
Date: Tue, 15 Mar 2022 17:07:51 -0400
Subject: [PATCH 3/3] added slurm example

---
 examples/scripts/calc_weights_job_array.sh | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 examples/scripts/calc_weights_job_array.sh

diff --git a/examples/scripts/calc_weights_job_array.sh b/examples/scripts/calc_weights_job_array.sh
new file mode 100644
index 0000000..f3cdec4
--- /dev/null
+++ b/examples/scripts/calc_weights_job_array.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+#SBATCH -c 2                          # Request two cores
+#SBATCH -N 1                          # Request one node (if you request more than one core with -c, also using
+                                      # -N 1 means all cores will be on the same node)
+#SBATCH -t 0-5:59                     # Runtime in D-HH:MM format
+#SBATCH -p short                      # Partition to run in
+#SBATCH --mem=10G                     # Total memory for all cores
+
+# To get email notifications, set both of these options below
+##SBATCH --mail-type=TIME_LIMIT_80,TIME_LIMIT,FAIL,ARRAY_TASKS
+##SBATCH --mail-user="@hms.harvard.edu"
+
+#SBATCH --job-name="deepseq_calcweights_date"
+# Job array-specific
+#SBATCH --output=slurm-%A_%a-%x.out   # File to which STDOUT + STDERR will be written, %A: jobID, %a: array task ID, %x: jobname
+#SBATCH --array=0-41%10               # Job array: tasks 0-41, with at most 10 running at once
+
+hostname
+pwd
+module load gcc/6.2.0 cuda/9.0
+export THEANO_FLAGS='floatX=float32,device=cuda,force_device=True'  # Otherwise Theano will only raise a warning and carry on with the CPU
+
+# To generate this file from a directory, just do e.g. 'ls -1 ALIGNMENTS_DIR/*.a2m > msas.txt'
+lines=( $(cat "msas.txt") )
+dataset_name=${lines[$SLURM_ARRAY_TASK_ID]}
+echo $dataset_name
+
+## Monitor GPU usage (store outputs in ./gpu_logs/)
+#/home/lov701/job_gpu_monitor.sh gpu_logs &
+
+srun stdbuf -oL -eL /n/groups/marks/users/aaron/deep_seqs/deep_seqs_env/bin/python \
+    /n/groups/marks/users/lood/DeepSequence_runs/examples/calc_weights.py \
+    --dataset $dataset_name \
+    --weights_dir_out /n/groups/marks/users/lood/DeepSequence_runs/weights_2021_11_16/ \
+    --alignments_dir datasets/alignments/
+# --theta-override 0.9
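As a quick sanity check of the round trip, the saved arrays can be inspected directly. A minimal sketch (the file name assumes dataset BLAT_ECOLX, theta 0.2 and the weights directory from the srun command above, following the <dataset>_theta_<theta>.npy convention in helper.py):

    import numpy as np

    weights = np.load("weights_2021_11_16/BLAT_ECOLX_theta_0.2.npy")
    print(weights.shape)            # one weight per sequence kept in the MSA
    print("Neff =", weights.sum())  # effective sequence count, as printed by DataHelper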