K-mer blacklist: prevent k-mers from being included in the learned models

K-mer blacklisting
aldro61 authored Oct 19, 2017
2 parents 1d9869b + 58e19b6 commit 9aedb97
Showing 6 changed files with 142 additions and 27 deletions.
84 changes: 65 additions & 19 deletions core/kover/learning/experiment.py
@@ -32,7 +32,7 @@
from .set_covering_machine.models import ConjunctionModel, DisjunctionModel
from .set_covering_machine.rules import LazyKmerRuleList, KmerRuleClassifications
from .set_covering_machine.scm import SetCoveringMachine
from ..utils import _duplicate_last_element, _unpack_binary_bytes_from_ints
from ..utils import _duplicate_last_element, _unpack_binary_bytes_from_ints, _parse_kmer_blacklist

def _get_metrics(predictions, answers):
if len(predictions.shape) == 1:
@@ -109,7 +109,7 @@ def _predictions(model, kmer_matrix, train_example_idx, test_example_idx, progre
return train_predictions, test_predictions


def _cv_score_hp(hp_values, max_rules, dataset_file, split_name):
def _cv_score_hp(hp_values, max_rules, dataset_file, split_name, rule_blacklist):
model_type = hp_values[0]
p = hp_values[1]

@@ -155,6 +155,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
rule_classifications=rule_classifications,
positive_example_idx=positive_example_idx,
negative_example_idx=negative_example_idx,
rule_blacklist=rule_blacklist,
tiebreaker=tiebreaker,
iteration_callback=iteration_callback)
test_predictions_by_model_length = np.array(_duplicate_last_element(test_predictions_by_model_length, max_rules))
@@ -169,7 +170,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
return (model_type, p, best_model_length), best_hp_score


def _cross_validation(dataset_file, split_name, model_types, p_values, max_rules, n_cpu, progress_callback,
def _cross_validation(dataset_file, split_name, model_types, p_values, max_rules, rule_blacklist, n_cpu, progress_callback,
warning_callback, error_callback):
"""
Returns the best parameter combination and its cv score
@@ -179,7 +180,7 @@ def _cross_validation(dataset_file, split_name, model_types, p_values, max_rules

logging.debug("Using %d CPUs." % n_cpu)
pool = Pool(processes=n_cpu)
hp_eval_func = partial(_cv_score_hp, dataset_file=dataset_file, split_name=split_name, max_rules=max_rules)
hp_eval_func = partial(_cv_score_hp, dataset_file=dataset_file, split_name=split_name, max_rules=max_rules, rule_blacklist=rule_blacklist)

best_hp_score = 1.0
best_hp = {"model_type": None, "p": None, "max_rules": None}
@@ -199,7 +200,7 @@ def _cross_validation(dataset_file, split_name, model_types, p_values, max_rules
return best_hp_score, best_hp


def _full_train(dataset, split_name, model_type, p, max_rules, max_equiv_rules, random_generator, progress_callback):
def _full_train(dataset, split_name, model_type, p, max_rules, max_equiv_rules, rule_blacklist, random_generator, progress_callback):
full_train_progress = {"n_rules": 0.0}

def _iteration_callback(iteration_infos, model_type, equivalent_rules):
@@ -248,6 +249,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
rule_classifications=rule_classifications,
positive_example_idx=positive_example_idx,
negative_example_idx=negative_example_idx,
rule_blacklist=rule_blacklist,
tiebreaker=partial(_tiebreaker,
rule_risks=np.hstack((split.unique_risk_by_kmer[...],
split.unique_risk_by_anti_kmer[...])),
@@ -288,7 +290,7 @@ def _bound(train_predictions, train_answers, train_example_idx, model, delta, ma
(216 * delta))))


def _bound_score_hp(hp_values, max_rules, dataset_file, split_name, max_equiv_rules, bound_delta,
def _bound_score_hp(hp_values, max_rules, dataset_file, split_name, max_equiv_rules, rule_blacklist, bound_delta,
bound_max_genome_size, random_generator):
model_type = hp_values[0]
p = hp_values[1]
@@ -366,6 +368,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
rule_classifications=rule_classifications,
positive_example_idx=positive_example_idx,
negative_example_idx=negative_example_idx,
rule_blacklist=rule_blacklist,
tiebreaker=tiebreaker,
iteration_callback=iteration_callback,
iteration_rule_importances=True)
@@ -380,7 +383,7 @@ def _tiebreaker(best_utility_idx, rule_risks, model_type):
return (model_type, p, best_model_length), best_hp_score, best_model, best_rule_importances, best_equivalent_rules


def _bound_selection(dataset_file, split_name, model_types, p_values, max_rules, max_equiv_rules, bound_delta,
def _bound_selection(dataset_file, split_name, model_types, p_values, max_rules, max_equiv_rules, rule_blacklist, bound_delta,
bound_max_genome_size, n_cpu, random_generator, progress_callback, warning_callback,
error_callback):
n_hp_combinations = len(model_types) * len(p_values)
Expand All @@ -389,7 +392,7 @@ def _bound_selection(dataset_file, split_name, model_types, p_values, max_rules,
logging.debug("Using %d CPUs." % n_cpu)
pool = Pool(processes=n_cpu)
hp_eval_func = partial(_bound_score_hp, dataset_file=dataset_file, split_name=split_name, max_rules=max_rules,
max_equiv_rules=max_equiv_rules, bound_delta=bound_delta,
max_equiv_rules=max_equiv_rules, rule_blacklist=rule_blacklist, bound_delta=bound_delta,
bound_max_genome_size=bound_max_genome_size, random_generator=random_generator)

best_hp_score = 1.0
@@ -413,7 +416,7 @@ def _bound_selection(dataset_file, split_name, model_types, p_values, max_rules,
return best_hp_score, best_hp, best_model, best_rule_importances, best_equiv_rules


def learn(dataset_file, split_name, model_type, p, max_rules, max_equiv_rules, parameter_selection, n_cpu, random_seed,
def _find_rule_blacklist(dataset_file, kmer_blacklist_file, warning_callback):
"""
Finds the index of the rules that must be blacklisted.
"""
dataset = KoverDataset(dataset_file)

# Find all rules to blacklist
rule_blacklist = []
if kmer_blacklist_file is not None:
kmers_to_blacklist = _parse_kmer_blacklist(kmer_blacklist_file, dataset.kmer_length)

if kmers_to_blacklist:
# XXX: the k-mers are upper-cased to avoid missing a match because of character case
kmer_sequences = np.array([x.upper() for x in dataset.kmer_sequences]).tolist()
kmer_by_matrix_column = np.array(dataset.kmer_by_matrix_column).tolist() # XXX: each k-mer is there only once (see wiki)
n_kmers = len(kmer_sequences)

kmers_not_found = []
rule_blacklist = []
for k in kmers_to_blacklist:
k = k.upper()
try:
presence_rule_idx = kmer_by_matrix_column.index(kmer_sequences.index(k))
absence_rule_idx = presence_rule_idx + n_kmers
rule_blacklist += [presence_rule_idx, absence_rule_idx]
except ValueError:
kmers_not_found.append(k)

if len(kmers_not_found) > 0:
warning_callback("The following k-mers could not be found in the dataset: " + ", ".join(kmers_not_found))

return rule_blacklist


def learn(dataset_file, split_name, model_type, p, kmer_blacklist_file, max_rules, max_equiv_rules, parameter_selection, n_cpu, random_seed,
bound_delta=None, bound_max_genome_size=None, progress_callback=None, warning_callback=None, error_callback=None):
"""
parameter_selection: bound, cv, none (use first value of each if multiple)
@@ -436,8 +474,12 @@ def normal_raise(exception):
model_type = np.unique(model_type)
p = np.unique(p)

rule_blacklist = _find_rule_blacklist(dataset_file=dataset_file,
kmer_blacklist_file=kmer_blacklist_file,
warning_callback=warning_callback)

dataset = KoverDataset(dataset_file)

# Score the hyperparameter combinations
# ------------------------------------------------------------------------------------------------------------------
if parameter_selection == "bound":
@@ -450,16 +492,20 @@ def normal_raise(exception):
best_hp, \
best_model, \
best_rule_importances, \
best_predictor_equiv_rules = _bound_selection(dataset_file, split_name, model_type, p, max_rules,
max_equiv_rules, bound_delta, bound_max_genome_size, n_cpu,
random_generator, progress_callback, warning_callback,
error_callback)
best_predictor_equiv_rules = _bound_selection(dataset_file=dataset_file, split_name=split_name, model_type=model_type,
p_values=p, max_rules=max_rules, max_equiv_rules=max_equiv_rules,
rule_blacklist=rule_blacklist, bound_delta=bound_delta,
bound_max_genome_size=bound_max_genome_size, n_cpu=n_cpu,
random_generator=random_generator, progress_callback=progress_callback,
warning_callback=warning_callback, error_callback=error_callback)
elif parameter_selection == "cv":
n_folds = len(dataset.get_split(split_name).folds)
if n_folds < 1:
error_callback(Exception("Cross-validation cannot be performed on a split with no folds."))
best_hp_score, best_hp = _cross_validation(dataset_file, split_name, model_type, p, max_rules, n_cpu,
progress_callback, warning_callback, error_callback)
best_hp_score, best_hp = _cross_validation(dataset_file=dataset_file, split_name=split_name, model_types=model_type,
p_values=p, max_rules=max_rules, rule_blacklist=rule_blacklist, n_cpu=n_cpu,
progress_callback=progress_callback, warning_callback=warning_callback,
error_callback=error_callback)
else:
# Use the first value provided for each parameter
best_hp = {"model_type": model_type[0], "p": p[0], "max_rules": max_rules}
@@ -473,9 +519,9 @@ def normal_raise(exception):
rule_importances = best_rule_importances
else:
model, rule_importances, \
equivalent_rules = _full_train(dataset, split_name, best_hp["model_type"], best_hp["p"],
best_hp["max_rules"], max_equiv_rules, random_generator,
progress_callback)
equivalent_rules = _full_train(dataset=dataset, split_name=split_name, model_type=best_hp["model_type"], p=best_hp["p"],
max_rules=best_hp["max_rules"], max_equiv_rules=max_equiv_rules, rule_blacklist=rule_blacklist,
random_generator=random_generator, progress_callback=progress_callback)

split = dataset.get_split(split_name)
train_example_idx = split.train_genome_idx
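For reference, here is a minimal, self-contained sketch of the index mapping performed by _find_rule_blacklist above. The plain lists stand in for the dataset's HDF5-backed kmer_sequences and kmer_by_matrix_column attributes, and the function name is illustrative: presence rules occupy indices [0, n_kmers) of the rule list, and each absence rule sits exactly n_kmers positions later.

def find_rule_blacklist_sketch(kmer_sequences, kmer_by_matrix_column, kmers_to_blacklist):
    """Maps each blacklisted k-mer to its (presence, absence) rule indices."""
    kmer_sequences = [k.upper() for k in kmer_sequences]
    n_kmers = len(kmer_sequences)
    rule_blacklist, kmers_not_found = [], []
    for k in kmers_to_blacklist:
        k = k.upper()
        try:
            # Find the matrix column that holds this k-mer; that column index
            # is also the index of the presence rule.
            presence_rule_idx = kmer_by_matrix_column.index(kmer_sequences.index(k))
            # The absence rule for the same k-mer is offset by n_kmers.
            rule_blacklist += [presence_rule_idx, presence_rule_idx + n_kmers]
        except ValueError:
            kmers_not_found.append(k)
    return rule_blacklist, kmers_not_found

# Three k-mers in the dataset; "TTTT" is blacklisted, "AAAA" is absent.
rules, missing = find_rule_blacklist_sketch(["ACGT", "TTTT", "GGGG"], [0, 1, 2], ["tttt", "AAAA"])
assert rules == [1, 4] and missing == ["AAAA"]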
2 changes: 1 addition & 1 deletion core/kover/learning/set_covering_machine/rules.py
@@ -74,7 +74,7 @@ def __getitem__(self, idx):
type = "presence"
kmer_idx = self.kmer_by_rule[idx]
return KmerRule(idx % len(self.kmer_sequences), self.kmer_sequences[kmer_idx], type)

def __len__(self):
return self.n_rules

2 changes: 1 addition & 1 deletion core/kover/learning/set_covering_machine/scm.py
@@ -81,7 +81,7 @@ def fit(self, rules, rule_classifications, positive_example_idx, negative_exampl
rule_blacklist = np.unique(rule_blacklist)
if len(rule_blacklist) == rule_classifications.shape[1]:
raise ValueError("The blacklist cannot include all the rules.")
logging.debug("The following rules are blacklisted and will not be considered:" + str(rule_blacklist))
logging.debug("Blacklisting: {0:d} kmers ({1:d} rules)".format(len(rule_blacklist) / 2, len(rule_blacklist)))

training_example_idx = np.hstack((positive_example_idx, negative_example_idx)) # Needed for rule importances
model_rules_idx = [] # Contains the index of the rules in the model
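The fit() hunk above validates and logs the blacklist; enforcing it amounts to excluding the blacklisted columns when the best rule is selected. A hedged sketch of that masking idea, with illustrative names rather than Kover's internal API:

import numpy as np

def best_rule_idx(utilities, rule_blacklist):
    # Score-based selection with blacklisted rule columns excluded.
    utilities = np.asarray(utilities, dtype=float)
    rule_blacklist = np.unique(rule_blacklist)
    if len(rule_blacklist) == len(utilities):
        raise ValueError("The blacklist cannot include all the rules.")
    mask = np.ones(len(utilities), dtype=bool)
    mask[rule_blacklist] = False
    candidates = np.flatnonzero(mask)
    return candidates[np.argmax(utilities[candidates])]

# Rule 0 has the highest utility but is blacklisted, so rule 2 is chosen.
print(best_rule_idx([0.9, 0.5, 0.8, 0.7], rule_blacklist=[0]))  # -> 2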
66 changes: 65 additions & 1 deletion core/kover/utils.py
@@ -26,28 +26,57 @@
def _class_to_string(instance):
"""
Returns a string representation of the public attributes of a class.
Parameters:
-----------
instance: object
An instance of any class.
Returns:
--------
string_rep: string
A string representation of the class and its public attributes.
Notes:
-----
Private attributes must be marked with a leading underscore.
"""
return instance.__class__.__name__ + "(" + ",".join(
[str(k) + "=" + str(v) for k, v in instance.__dict__.iteritems() if str(k[0]) != "_"]) + ")"


def _duplicate_last_element(l, length):
"""
Duplicates the last element of a list until a given length is reached. (In-place)
"""
l += [l[-1]] * (length - len(l))
return l


def _fasta_to_sequences(path):
"""
Reads a FASTA file and extracts all the sequences (contigs) that it contains.
"""
contigs = []
buffer = None
for l in open(path, "r"):
if l.startswith(">"):
if buffer is not None:
contigs.append(buffer.upper())
buffer = ""
else:
if buffer is None:
buffer = l.strip()
else:
buffer += l.strip()
if buffer is not None and buffer != "":
contigs.append(buffer.upper())
return contigs


def _hdf5_open_no_chunk_cache(filename, access_type=h.h5f.ACC_RDONLY):
fid = h.h5f.open(filename, access_type)
access_property_list = fid.get_access_plist()
@@ -59,9 +88,11 @@ def _hdf5_open_no_chunk_cache(filename, access_type=h.h5f.ACC_RDONLY):
file_id = h.h5f.open(filename, access_type, fapl=access_property_list)
return h.File(file_id)


def _minimum_uint_size(max_value):
"""
Find the minimum size unsigned integer type that can store values of at most max_value
"""
if max_value <= np.iinfo(np.uint8).max:
return np.uint8
@@ -74,9 +105,11 @@ def _minimum_uint_size(max_value):
else:
return np.uint128  # XXX: NumPy defines no uint128; reaching this fallback raises AttributeError


def _pack_binary_bytes_to_ints(a, pack_size):
"""
Packs binary values stored in bytes into ints
"""
if pack_size == 64:
type = np.uint64
@@ -99,9 +132,11 @@ def _pack_binary_bytes_to_ints(a, pack_size):

return b


def _unpack_binary_bytes_from_ints(a):
"""
Unpacks binary values stored in bytes into ints
"""
type = a.dtype

@@ -127,4 +162,33 @@ def _unpack_binary_bytes_from_ints(a):
b[i] = tmp > 0
packed_rows += 1

return b
return b


def _parse_kmer_blacklist(blacklist_path, expected_kmer_len):
data = []

# Fasta file format
fasta_extensions = [".fasta", ".fa", ".fas", ".fna"]
if any(blacklist_path.endswith(extension) for extension in fasta_extensions):
data = _fasta_to_sequences(blacklist_path)

# Other file format (text file with one kmer per line)
else:
# Loading data and splitting on every line
data = [l.rstrip('\n') for l in open(blacklist_path, "r")]

# Filtering for empty strings (empty lines)
data = [x for x in data if x]

def is_valid_kmer(k):
return len(set(k).difference(["A", "C", "G", "T", "a", "c", "g", "t"])) == 0
for kmer in data:
if not is_valid_kmer(kmer):
raise ValueError("{} is not a valid DNA sequence".format(kmer))

if not all(len(kmer) == expected_kmer_len for kmer in data):
raise ValueError("The k-mers to blacklist must all have the same length as the dataset k-mers")

return data
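
An example of how the new _parse_kmer_blacklist helper can be used, assuming the kover package is importable (file names and contents are illustrative):

from kover.utils import _parse_kmer_blacklist

# Plain-text format: one k-mer per line, blank lines are ignored.
with open("blacklist.txt", "w") as f:
    f.write("ACGTACGTACGTACGTACGTACGTACGTACG\n"
            "\n"
            "TGCATGCATGCATGCATGCATGCATGCATGC\n")

kmers = _parse_kmer_blacklist("blacklist.txt", expected_kmer_len=31)
assert len(kmers) == 2

# FASTA files (.fasta, .fa, .fas, .fna) go through _fasta_to_sequences instead;
# non-ACGT characters or a k-mer length mismatch raise ValueError.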

2 changes: 1 addition & 1 deletion core/setup.py
@@ -11,7 +11,7 @@ def finalize_options(self):

setup(
name = "kover",
version = "1.2.2",
version = "1.3.0",
packages = find_packages(),

cmdclass={'build_ext':build_ext},