K-mer blacklist: prevent k-mers from being included in the learned models

K-mer blacklisting
aldro61 authored Oct 19, 2017
2 parents 1d9869b + 58e19b6 commit 9aedb97
Showing 6 changed files with 142 additions and 27 deletions.
84 changes: 65 additions & 19 deletions core/kover/learning/experiment.py
@@ -32,7 +32,7 @@
from .set_covering_machine.models import ConjunctionModel, DisjunctionModel
from .set_covering_machine.rules import LazyKmerRuleList, KmerRuleClassifications
from .set_covering_machine.scm import SetCoveringMachine
from ..utils import _duplicate_last_element, _unpack_binary_bytes_from_ints
from ..utils import _duplicate_last_element, _unpack_binary_bytes_from_ints, _parse_kmer_blacklist

def _get_metrics(predictions, answers):
if len(predictions.shape) == 1:
@@ -109,7 +109,7 @@ def _predictions(model, kmer_matrix, train_example_idx, test_example_idx, progre
return train_predictions, test_predictions


def _cv_score_hp(hp_values, max_rules, dataset_file, split_name):
def _cv_score_hp(hp_values, max_rules, dataset_file, split_name, rule_blacklist):
model_type = hp_values[0]
p = hp_values[1]

@@ -155,6 +155,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
rule_classifications=rule_classifications,
positive_example_idx=positive_example_idx,
negative_example_idx=negative_example_idx,
rule_blacklist=rule_blacklist,
tiebreaker=tiebreaker,
iteration_callback=iteration_callback)
test_predictions_by_model_length = np.array(_duplicate_last_element(test_predictions_by_model_length, max_rules))
@@ -169,7 +170,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
return (model_type, p, best_model_length), best_hp_score


def _cross_validation(dataset_file, split_name, model_types, p_values, max_rules, n_cpu, progress_callback,
def _cross_validation(dataset_file, split_name, model_types, p_values, max_rules, rule_blacklist, n_cpu, progress_callback,
warning_callback, error_callback):
"""
Returns the best parameter combination and its cv score
@@ -179,7 +180,7 @@ def _cross_validation(dataset_file, split_name, model_types, p_values, max_rules

logging.debug("Using %d CPUs." % n_cpu)
pool = Pool(processes=n_cpu)
hp_eval_func = partial(_cv_score_hp, dataset_file=dataset_file, split_name=split_name, max_rules=max_rules)
hp_eval_func = partial(_cv_score_hp, dataset_file=dataset_file, split_name=split_name, max_rules=max_rules, rule_blacklist=rule_blacklist)

best_hp_score = 1.0
best_hp = {"model_type": None, "p": None, "max_rules": None}
@@ -199,7 +200,7 @@ def _cross_validation(dataset_file, split_name, model_types, p_values, max_rules
return best_hp_score, best_hp


def _full_train(dataset, split_name, model_type, p, max_rules, max_equiv_rules, random_generator, progress_callback):
def _full_train(dataset, split_name, model_type, p, max_rules, max_equiv_rules, rule_blacklist, random_generator, progress_callback):
full_train_progress = {"n_rules": 0.0}

def _iteration_callback(iteration_infos, model_type, equivalent_rules):
@@ -248,6 +249,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
rule_classifications=rule_classifications,
positive_example_idx=positive_example_idx,
negative_example_idx=negative_example_idx,
rule_blacklist=rule_blacklist,
tiebreaker=partial(_tiebreaker,
rule_risks=np.hstack((split.unique_risk_by_kmer[...],
split.unique_risk_by_anti_kmer[...])),
@@ -288,7 +290,7 @@ def _bound(train_predictions, train_answers, train_example_idx, model, delta, ma
(216 * delta))))


def _bound_score_hp(hp_values, max_rules, dataset_file, split_name, max_equiv_rules, bound_delta,
def _bound_score_hp(hp_values, max_rules, dataset_file, split_name, max_equiv_rules, rule_blacklist, bound_delta,
bound_max_genome_size, random_generator):
model_type = hp_values[0]
p = hp_values[1]
@@ -366,6 +368,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
rule_classifications=rule_classifications,
positive_example_idx=positive_example_idx,
negative_example_idx=negative_example_idx,
rule_blacklist=rule_blacklist,
tiebreaker=tiebreaker,
iteration_callback=iteration_callback,
iteration_rule_importances=True)
@@ -380,7 +383,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
return (model_type, p, best_model_length), best_hp_score, best_model, best_rule_importances, best_equivalent_rules


def _bound_selection(dataset_file, split_name, model_types, p_values, max_rules, max_equiv_rules, bound_delta,
def _bound_selection(dataset_file, split_name, model_types, p_values, max_rules, max_equiv_rules, rule_blacklist, bound_delta,
bound_max_genome_size, n_cpu, random_generator, progress_callback, warning_callback,
error_callback):
n_hp_combinations = len(model_types) * len(p_values)
Expand All @@ -389,7 +392,7 @@ def _bound_selection(dataset_file, split_name, model_types, p_values, max_rules,
logging.debug("Using %d CPUs." % n_cpu)
pool = Pool(processes=n_cpu)
hp_eval_func = partial(_bound_score_hp, dataset_file=dataset_file, split_name=split_name, max_rules=max_rules,
max_equiv_rules=max_equiv_rules, bound_delta=bound_delta,
max_equiv_rules=max_equiv_rules, rule_blacklist=rule_blacklist, bound_delta=bound_delta,
bound_max_genome_size=bound_max_genome_size, random_generator=random_generator)

best_hp_score = 1.0
@@ -413,7 +416,7 @@ def _bound_selection(dataset_file, split_name, model_types, p_values, max_rules,
return best_hp_score, best_hp, best_model, best_rule_importances, best_equiv_rules


def learn(dataset_file, split_name, model_type, p, max_rules, max_equiv_rules, parameter_selection, n_cpu, random_seed,
def _find_rule_blacklist(dataset_file, kmer_blacklist_file, warning_callback):
"""
Finds the index of the rules that must be blacklisted.
"""
dataset = KoverDataset(dataset_file)

# Find all rules to blacklist
rule_blacklist = []
if kmer_blacklist_file is not None:
kmers_to_blacklist = _parse_kmer_blacklist(kmer_blacklist_file, dataset.kmer_length)

if kmers_to_blacklist:
# XXX: the k-mers are upper-cased to avoid missing a match because of character case
kmer_sequences = np.array([x.upper() for x in dataset.kmer_sequences]).tolist()
kmer_by_matrix_column = np.array(dataset.kmer_by_matrix_column).tolist() # XXX: each k-mer is there only once (see wiki)
n_kmers = len(kmer_sequences)

kmers_not_found = []
rule_blacklist = []
for k in kmers_to_blacklist:
k = k.upper()
try:
presence_rule_idx = kmer_by_matrix_column.index(kmer_sequences.index(k))
absence_rule_idx = presence_rule_idx + n_kmers
rule_blacklist += [presence_rule_idx, absence_rule_idx]
except ValueError:
kmers_not_found.append(k)

if len(kmers_not_found) > 0:
warning_callback("The following k-mers could not be found in the dataset: " + ", ".join(kmers_not_found))

return rule_blacklist


def learn(dataset_file, split_name, model_type, p, kmer_blacklist_file, max_rules, max_equiv_rules, parameter_selection, n_cpu, random_seed,
bound_delta=None, bound_max_genome_size=None, progress_callback=None, warning_callback=None, error_callback=None):
"""
parameter_selection: bound, cv, none (use first value of each if multiple)
@@ -436,8 +474,12 @@ def normal_raise(exception):
model_type = np.unique(model_type)
p = np.unique(p)

rule_blacklist = _find_rule_blacklist(dataset_file=dataset_file,
kmer_blacklist_file=kmer_blacklist_file,
warning_callback=warning_callback)

dataset = KoverDataset(dataset_file)

# Score the hyperparameter combinations
# ------------------------------------------------------------------------------------------------------------------
if parameter_selection == "bound":
@@ -450,16 +492,20 @@ def normal_raise(exception):
best_hp, \
best_model, \
best_rule_importances, \
best_predictor_equiv_rules = _bound_selection(dataset_file, split_name, model_type, p, max_rules,
max_equiv_rules, bound_delta, bound_max_genome_size, n_cpu,
random_generator, progress_callback, warning_callback,
error_callback)
best_predictor_equiv_rules = _bound_selection(dataset_file=dataset_file, split_name=split_name, model_type=model_type,
p_values=p, max_rules=max_rules, max_equiv_rules=max_equiv_rules,
rule_blacklist=rule_blacklist, bound_delta=bound_delta,
bound_max_genome_size=bound_max_genome_size, n_cpu=n_cpu,
random_generator=random_generator, progress_callback=progress_callback,
warning_callback=warning_callback, error_callback=error_callback)
elif parameter_selection == "cv":
n_folds = len(dataset.get_split(split_name).folds)
if n_folds < 1:
error_callback(Exception("Cross-validation cannot be performed on a split with no folds."))
best_hp_score, best_hp = _cross_validation(dataset_file, split_name, model_type, p, max_rules, n_cpu,
progress_callback, warning_callback, error_callback)
best_hp_score, best_hp = _cross_validation(dataset_file=dataset_file, split_name=split_name, model_types=model_type,
p_values=p, max_rules=max_rules, rule_blacklist=rule_blacklist, n_cpu=n_cpu,
progress_callback=progress_callback, warning_callback=warning_callback,
error_callback=error_callback)
else:
# Use the first value provided for each parameter
best_hp = {"model_type": model_type[0], "p": p[0], "max_rules": max_rules}
@@ -473,9 +519,9 @@ def normal_raise(exception):
rule_importances = best_rule_importances
else:
model, rule_importances, \
equivalent_rules = _full_train(dataset, split_name, best_hp["model_type"], best_hp["p"],
best_hp["max_rules"], max_equiv_rules, random_generator,
progress_callback)
equivalent_rules = _full_train(dataset=dataset, split_name=split_name, model_type=best_hp["model_type"], p=best_hp["p"],
max_rules=best_hp["max_rules"], max_equiv_rules=max_equiv_rules, rule_blacklist=rule_blacklist,
random_generator=random_generator, progress_callback=progress_callback)

split = dataset.get_split(split_name)
train_example_idx = split.train_genome_idx
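For reference, here is a minimal, self-contained sketch of the index mapping performed by _find_rule_blacklist above. The plain lists stand in for the dataset's HDF5-backed kmer_sequences and kmer_by_matrix_column attributes, and the function name is illustrative: presence rules occupy indices [0, n_kmers) of the rule list, and each absence rule sits exactly n_kmers positions later.

def find_rule_blacklist_sketch(kmer_sequences, kmer_by_matrix_column, kmers_to_blacklist):
    """Maps each blacklisted k-mer to its (presence, absence) rule indices."""
    kmer_sequences = [k.upper() for k in kmer_sequences]
    n_kmers = len(kmer_sequences)
    rule_blacklist, kmers_not_found = [], []
    for k in kmers_to_blacklist:
        k = k.upper()
        try:
            # Find the matrix column that holds this k-mer; that column index
            # is also the index of the presence rule.
            presence_rule_idx = kmer_by_matrix_column.index(kmer_sequences.index(k))
            # The absence rule for the same k-mer is offset by n_kmers.
            rule_blacklist += [presence_rule_idx, presence_rule_idx + n_kmers]
        except ValueError:
            kmers_not_found.append(k)
    return rule_blacklist, kmers_not_found

# Three k-mers in the dataset; "TTTT" is blacklisted, "AAAA" is absent.
rules, missing = find_rule_blacklist_sketch(["ACGT", "TTTT", "GGGG"], [0, 1, 2], ["tttt", "AAAA"])
assert rules == [1, 4] and missing == ["AAAA"]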
2 changes: 1 addition & 1 deletion core/kover/learning/set_covering_machine/rules.py
@@ -74,7 +74,7 @@ def __getitem__(self, idx):
type = "presence"
kmer_idx = self.kmer_by_rule[idx]
return KmerRule(idx % len(self.kmer_sequences), self.kmer_sequences[kmer_idx], type)

def __len__(self):
return self.n_rules

2 changes: 1 addition & 1 deletion core/kover/learning/set_covering_machine/scm.py
@@ -81,7 +81,7 @@ def fit(self, rules, rule_classifications, positive_example_idx, negative_exampl
rule_blacklist = np.unique(rule_blacklist)
if len(rule_blacklist) == rule_classifications.shape[1]:
raise ValueError("The blacklist cannot include all the rules.")
logging.debug("The following rules are blacklisted and will not be considered:" + str(rule_blacklist))
logging.debug("Blacklisting: {0:d} kmers ({1:d} rules)".format(len(rule_blacklist) / 2, len(rule_blacklist)))

training_example_idx = np.hstack((positive_example_idx, negative_example_idx)) # Needed for rule importances
model_rules_idx = [] # Contains the index of the rules in the model
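The fit() hunk above validates and logs the blacklist; enforcing it amounts to excluding the blacklisted columns when the best rule is selected. A hedged sketch of that masking idea, with illustrative names rather than Kover's internal API:

import numpy as np

def best_rule_idx(utilities, rule_blacklist):
    # Score-based selection with blacklisted rule columns excluded.
    utilities = np.asarray(utilities, dtype=float)
    rule_blacklist = np.unique(rule_blacklist)
    if len(rule_blacklist) == len(utilities):
        raise ValueError("The blacklist cannot include all the rules.")
    mask = np.ones(len(utilities), dtype=bool)
    mask[rule_blacklist] = False
    candidates = np.flatnonzero(mask)
    return candidates[np.argmax(utilities[candidates])]

# Rule 0 has the highest utility but is blacklisted, so rule 2 is chosen.
print(best_rule_idx([0.9, 0.5, 0.8, 0.7], rule_blacklist=[0]))  # -> 2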
66 changes: 65 additions & 1 deletion core/kover/utils.py
@@ -26,28 +26,57 @@
def _class_to_string(instance):
"""
Returns a string representation of the public attributes of a class.
Parameters:
-----------
instance: object
An instance of any class.
Returns:
--------
string_rep: string
A string representation of the class and its public attributes.
Notes:
-----
Private attributes must be marked with a leading underscore.
"""
return instance.__class__.__name__ + "(" + ",".join(
[str(k) + "=" + str(v) for k, v in instance.__dict__.iteritems() if str(k[0]) != "_"]) + ")"


def _duplicate_last_element(l, length):
"""
Duplicates the last element of a list until a given length is reached. (In-place)
"""
l += [l[-1]] * (length - len(l))
return l


def _fasta_to_sequences(path):
"""
Reads a FASTA file and extracts all the sequences (contigs) that it contains.
"""
contigs = []
buffer = None
for l in open(path, "r"):
if l.startswith(">"):
if buffer is not None:
contigs.append(buffer.upper())
buffer = ""
else:
if buffer is None:
buffer = l.strip()
else:
buffer += l.strip()
if buffer is not None and buffer != "":
contigs.append(buffer.upper())
return contigs


def _hdf5_open_no_chunk_cache(filename, access_type=h.h5f.ACC_RDONLY):
fid = h.h5f.open(filename, access_type)
access_property_list = fid.get_access_plist()
@@ -59,9 +88,11 @@ def _hdf5_open_no_chunk_cache(filename, access_type=h.h5f.ACC_RDONLY):
file_id = h.h5f.open(filename, access_type, fapl=access_property_list)
return h.File(file_id)


def _minimum_uint_size(max_value):
"""
Find the minimum size unsigned integer type that can store values of at most max_value
"""
if max_value <= np.iinfo(np.uint8).max:
return np.uint8
@@ -74,9 +105,11 @@ def _minimum_uint_size(max_value):
else:
return np.uint128  # XXX: NumPy defines no uint128; reaching this fallback raises AttributeError


def _pack_binary_bytes_to_ints(a, pack_size):
"""
Packs binary values stored in bytes into ints
"""
if pack_size == 64:
type = np.uint64
@@ -99,9 +132,11 @@ def _pack_binary_bytes_to_ints(a, pack_size):

return b


def _unpack_binary_bytes_from_ints(a):
"""
Unpacks binary values stored in bytes into ints
"""
type = a.dtype

@@ -127,4 +162,33 @@ def _unpack_binary_bytes_from_ints(a):
b[i] = tmp > 0
packed_rows += 1

return b
return b


def _parse_kmer_blacklist(blacklist_path, expected_kmer_len):
data = []

# Fasta file format
fasta_extensions = [".fasta", ".fa", ".fas", ".fna"]
if any(blacklist_path.endswith(extension) for extension in fasta_extensions):
data = _fasta_to_sequences(blacklist_path)

# Other file format (text file with one kmer per line)
else:
# Loading data and splitting on every line
data = [l.rstrip('\n') for l in open(blacklist_path, "r")]

# Filtering for empty strings (empty lines)
data = [x for x in data if x]

def is_valid_kmer(k):
return len(set(k).difference(["A", "C", "G", "T", "a", "c", "g", "t"])) == 0
for kmer in data:
if not is_valid_kmer(kmer):
raise ValueError("{} is not a valid DNA sequence".format(kmer))

if not all(len(kmer) == expected_kmer_len for kmer in data):
raise ValueError("The k-mers to blacklist must all have the same length as the dataset k-mers")

return data
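
An example of how the new _parse_kmer_blacklist helper can be used, assuming the kover package is importable (file names and contents are illustrative):

from kover.utils import _parse_kmer_blacklist

# Plain-text format: one k-mer per line, blank lines are ignored.
with open("blacklist.txt", "w") as f:
    f.write("ACGTACGTACGTACGTACGTACGTACGTACG\n"
            "\n"
            "TGCATGCATGCATGCATGCATGCATGCATGC\n")

kmers = _parse_kmer_blacklist("blacklist.txt", expected_kmer_len=31)
assert len(kmers) == 2

# FASTA files (.fasta, .fa, .fas, .fna) go through _fasta_to_sequences instead;
# non-ACGT characters or a k-mer length mismatch raise ValueError.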

2 changes: 1 addition & 1 deletion core/setup.py
@@ -11,7 +11,7 @@ def finalize_options(self):

setup(
name = "kover",
version = "1.2.2",
version = "1.3.0",
packages = find_packages(),

cmdclass={'build_ext':build_ext},