diff --git a/doc/source/pages/analysis.rst b/doc/source/pages/analysis.rst
index 85659f2..2611f15 100644
--- a/doc/source/pages/analysis.rst
+++ b/doc/source/pages/analysis.rst
@@ -1 +1,7 @@
 .. automodule:: pbxplore.analysis
+
+K-means
+-------
+
+.. automodule:: pbxplore.analysis.kmeans
+   :members:
diff --git a/pbxplore/analysis/kmeans.py b/pbxplore/analysis/kmeans.py
new file mode 100644
index 0000000..66ea009
--- /dev/null
+++ b/pbxplore/analysis/kmeans.py
@@ -0,0 +1,255 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function, absolute_import
+
+# Imports for the clustering
+import random
+import sys
+import numpy
+import collections
+import copy
+
+from ..PB import NAMES
+
+
+def count_per_position_partial(sequences, seq_indices):
+    """
+    Count the population of each block at each position.
+
+    Parameters
+    ----------
+    sequences
+        a list of sequences
+    seq_indices
+        a list of indices; only the sequences from `sequences`
+        which have their index in `seq_indices` will be taken into
+        account.
+
+    Returns
+    -------
+    counts: list of collections.defaultdict
+        one defaultdict per position, with the blocks as keys and their
+        population as values; the default value is the integer 0.
+    """
+    counts = [collections.defaultdict(int)
+              for _ in range(len(sequences[0]))]
+    for seq_idx in seq_indices:
+        seq = sequences[seq_idx]
+        for block, count in zip(seq, counts):
+            count[block] += 1
+    return counts
+
+
+def make_profile_partial(sequences, seq_indices):
+    """
+    Calculate a frequency profile from a list of sequences.
+
+    A profile is the frequency of each block at each position.
+
+    Parameters
+    ----------
+    sequences
+        a list of sequences
+    seq_indices
+        a list of indices; only the sequences from `sequences`
+        which have their index in `seq_indices` will be taken into
+        account.
+
+    Returns
+    -------
+    profile: numpy array
+        the frequency profile as a numpy array with a row for each
+        block and a column for each position.
+    """
+    counts = count_per_position_partial(sequences, seq_indices)
+    profile = numpy.zeros((16, len(counts)))
+    for pos_idx, position in enumerate(counts):
+        for block_idx, block in enumerate(NAMES):
+            profile[block_idx, pos_idx] = position[block]
+    profile /= len(seq_indices)
+    return profile
+
+
+def compatibility(profile, sequence):
+    """
+    Compute the compatibility of a sequence with a profile.
+
+    Parameters
+    ----------
+    profile
+        a frequency profile as a numpy array with a row for each
+        block and a column for each position
+    sequence
+        the block sequence to test, as a string
+
+    Returns
+    -------
+    probability: float
+        cumulative probability of the given sequence given the profile
+    """
+    probabilities = numpy.zeros((profile.shape[1], ))
+    for pos_idx, block in enumerate(sequence):
+        block_idx = NAMES.find(block)
+        probabilities[pos_idx] = profile[block_idx, pos_idx]
+    return numpy.sum(probabilities)
+
+
+def _argmax(values):
+    """
+    Return the index of the maximum value of a sequence.
+    """
+    iter_values = iter(values)
+    argmax = 0
+    valmax = next(iter_values)
+    for arg, val in enumerate(iter_values, start=1):
+        if val > valmax:
+            valmax = val
+            argmax = arg
+    return argmax
+
+
+def attribute_center(centers, sequences):
+    """
+    Assign the most compatible center to each sequence.
+
+    For a list of frequency profiles and a list of sequences, assign to
+    each sequence the most compatible frequency profile.
+
+    Parameters
+    ----------
+    centers
+        a list of frequency profiles, each profile is a numpy array
+        with rows for the blocks and columns for the positions
+    sequences
+        a list of block sequences
+
+    Returns
+    -------
+    groups: list
+        a list of the profile index for each sequence
+    """
+    groups = [0 for _ in range(len(sequences))]
+    for seq_idx, sequence in enumerate(sequences):
+        compatibilities = [compatibility(center, sequence)
+                           for center in centers]
+        groups[seq_idx] = _argmax(compatibilities)
+    return groups
+
+
+def update_centers(sequences, groups, ngroups):
+    """
+    Calculate the frequency profile of each group.
+
+    Parameters
+    ----------
+    sequences
+        a list of block sequences
+    groups
+        a list with a group identifier for each sequence
+    ngroups
+        the number of groups; this number cannot be reliably deduced
+        from the `groups` list since some groups can be empty and
+        therefore missing from that list
+
+    Returns
+    -------
+    centers: list
+        a frequency profile for each group
+    """
+    grouped_indices = [[] for _ in range(ngroups)]
+    for seq_idx, group_idx in enumerate(groups):
+        grouped_indices[group_idx].append(seq_idx)
+    centers = []
+    for group in grouped_indices:
+        centers.append(make_profile_partial(sequences, group))
+    return centers
+
+
+def get_medoids(groups):
+    raise NotImplementedError
+
+
+def initial_centers(sequences, ngroups):
+    """
+    Create single-sequence frequency profiles from randomly chosen sequences.
+
+    Randomly choose `ngroups` sequences and build a single-sequence
+    frequency profile for each of them. These profiles are meant to be
+    used as initial centers for the clustering.
+
+    Parameters
+    ----------
+    sequences
+        a list of block sequences
+    ngroups
+        the number of profiles to build
+
+    Returns
+    -------
+    centers: list
+        a list of frequency profiles
+    """
+    centers = []
+    sample = random.sample(range(len(sequences)), ngroups)
+    assert len(sample) == len(set(sample)), \
+        'Redundancies in the initial sampling'
+    assert len([sequences[i] for i in sample]) \
+        == len(set([sequences[i] for i in sample])), \
+        'Redundancies in the initial sampling'
+    for sequence_idx in sample:
+        centers.append(make_profile_partial(sequences, [sequence_idx, ]))
+    return centers
+
+
+def _is_converged(old_centers, new_centers):
+    """
+    Return True if all `new_centers` are the same as the `old_centers`.
+    """
+    for old, new in zip(old_centers, new_centers):
+        if not numpy.all(old == new):
+            return False
+    return True
+
+
+def k_means(sequences, ngroups, max_iter, logfile=sys.stdout):
+    """
+    Carry out a K-means clustering on block sequences.
+
+    Cluster the sequences into `ngroups` groups. The center of each
+    group is computed as a frequency profile of the sequences in the
+    group. The distance metric used to assign a center to a sequence
+    is the probability of obtaining that sequence from the given
+    frequency profile.
+
+    Parameters
+    ----------
+    sequences
+        a list of block sequences to cluster
+    ngroups
+        the number of clusters to build
+    max_iter
+        the maximum number of iterations to run
+    logfile
+        a file object to write logs to, stdout by default
+
+    Returns
+    -------
+    groups: list
+        a list of cluster identifiers, one for each sequence
+    centers: list
+        the frequency profile of each cluster
+    convergence: bool
+        True if the clustering converged, else False
+    """
+    convergence = False
+    centers = initial_centers(sequences, ngroups)
+    for iteration in range(1, max_iter + 1):
+        groups = attribute_center(centers, sequences)
+        print(iteration, collections.Counter(groups), len(centers),
+              file=logfile)
+        new_centers = update_centers(sequences, groups, ngroups)
+        if _is_converged(centers, new_centers):
+            print('Convergence reached in {} iterations'
+                  .format(iteration), file=logfile)
+            convergence = True
+            break
+        centers = copy.copy(new_centers)
+    else:
+        print(('K-means reached {} iterations before '
+               'reaching convergence.').format(max_iter),
+              file=logfile)
+    return groups, centers, convergence
diff --git a/pbxplore/scripts/PBkmeans.py b/pbxplore/scripts/PBkmeans.py
new file mode 100644
index 0000000..5f113cc
--- /dev/null
+++ b/pbxplore/scripts/PBkmeans.py
@@ -0,0 +1,93 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Cluster protein structures based on their PB sequences using the k-means
+algorithm.
+
+2013 - P. Poulain, A. G. de Brevern
+"""
+
+
+# Use print as a function for python 3 compatibility
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+import pbxplore as pbx
+from pbxplore.analysis import kmeans
+from pbxplore.scripts import PBclust
+
+try:
+    import builtins
+except ImportError:
+    import itertools
+    zip = itertools.izip
+else:
+    del builtins
+
+
+def write_kmeans_clusters(fname, headers, groups):
+    with open(fname, 'w') as outfile:
+        for header, group in zip(headers, groups):
+            print('{0}\t{1}'.format(header, group), file=outfile)
+
+
+def user_input():
+    """
+    Handle PBkmeans command line arguments
+    """
+    parser = argparse.ArgumentParser(
+        description=("Cluster protein structures based "
+                     "on their PB sequences using the k-means "
+                     "algorithm."))
+
+    # mandatory arguments
+    parser.add_argument("-f", action="append", required=True,
+                        help="name(s) of the PBs file (in fasta format)")
+    parser.add_argument("-o", action="store", required=True,
+                        help="name for results")
+    parser.add_argument("--clusters", action="store", type=int, required=True,
+                        help="number of wanted clusters")
+
+    # options
+    parser.add_argument("--max-iter", dest='max_iter', type=int, default=100,
+                        help="maximum number of iterations")
+
+    # get all arguments
+    options = parser.parse_args()
+
+    # test if the number of clusters is valid
+    if options.clusters <= 0:
+        parser.error("Number of clusters must be > 0.")
+
+    # check if input files exist
+    for name in options.f:
+        if not os.path.isfile(name):
+            parser.error("{0}: not a valid file. Bye".format(name))
Bye".format(name)) + + return options + + +def pbkmeans_cli(): + # Read user inputs + options = user_input() + header_lst, seq_lst = pbx.io.read_several_fasta(options.f) + + # Do the clustering + groups, _, convergeance = kmeans.k_means(seq_lst, + ngroups=options.clusters, + max_iter=options.max_iter) + + # Write the output + output_fname = options.o + ".PB.kmeans" + write_kmeans_clusters(output_fname, header_lst, seq_lst) + + # Write the report + PBclust.display_clust_report(groups) + + return 0 + +if __name__ =='__main__': + sys.exit(pbkmeans_cli()) diff --git a/pbxplore/test/test_functions.py b/pbxplore/test/test_functions.py index 8b4f138..14adac8 100644 --- a/pbxplore/test/test_functions.py +++ b/pbxplore/test/test_functions.py @@ -16,8 +16,11 @@ import collections import os +import numpy + import pbxplore as pbx from pbxplore.structure import structure +from pbxplore.analysis import kmeans here = os.path.abspath(os.path.dirname(__file__)) @@ -174,5 +177,130 @@ def test_read_fasta(self): 'mmmmmnoopacddddddehkllmmmmngoilmmmm' 'mmmmmmmmnopacdcddZZ']) + +class TestKMeans(unittest.TestCase): + """ + Test individual functions for K-Means clustering + """ + + def test_count_per_position_partial(self): + sequences = ['abcdef', 'bcdefg', 'cdefgh', + 'defghi', 'efghij', 'fghijk', # ignore in the test + 'ghijkl', 'hijklm', 'ijklmn', + 'ghijkl', 'hijklm', 'ijklmn', + 'jklmno', 'klmnop', ] # ignore in the test + indices = [0, 1, 2, 6, 7, 8, 9, 10, 11] + + # Shorten defaultdict call to have a nicer table + dd = collections.defaultdict + # Create a list with the right number of elements. The type of each + # element does not matter as the elements will be overwritten + ref_count = ['' for _ in range(6)] + # Fill the reference. Each element of the list corresponds to a + # position in the sequences. Only the sequences that have their index + # in the ``indices`` list will be used in the count. 
+        ref_count[0] = dd(int, {'a': 1, 'b': 1, 'c': 1, 'g': 2, 'h': 2, 'i': 2})
+        ref_count[1] = dd(int, {'b': 1, 'c': 1, 'd': 1, 'h': 2, 'i': 2, 'j': 2})
+        ref_count[2] = dd(int, {'c': 1, 'd': 1, 'e': 1, 'i': 2, 'j': 2, 'k': 2})
+        ref_count[3] = dd(int, {'d': 1, 'e': 1, 'f': 1, 'j': 2, 'k': 2, 'l': 2})
+        ref_count[4] = dd(int, {'e': 1, 'f': 1, 'g': 1, 'k': 2, 'l': 2, 'm': 2})
+        ref_count[5] = dd(int, {'f': 1, 'g': 1, 'h': 1, 'l': 2, 'm': 2, 'n': 2})
+
+        count = kmeans.count_per_position_partial(sequences, indices)
+        self.assertEqual(count, ref_count)
+
+    def test_make_profile_partial(self):
+        sequences = ['abcdef', 'bcdefg', 'cdefgh',
+                     'defghi', 'efghij', 'fghijk',  # ignored in the test
+                     'ghijkl', 'hijklm', 'ijklmn',
+                     'ghijkl', 'hijklm', 'ijklmn',
+                     'jklmno', 'klmnop',  # ignored in the test
+                     'ijklmn']
+        # Using 10 sequences makes the expected frequencies easier to write
+        indices = [0, 1, 2, 6, 7, 8, 9, 10, 11, 14]
+        ref_profile = numpy.array([[0.1, 0.0, 0.0, 0.0, 0.0, 0.0],   # a
+                                   [0.1, 0.1, 0.0, 0.0, 0.0, 0.0],   # b
+                                   [0.1, 0.1, 0.1, 0.0, 0.0, 0.0],   # c
+                                   [0.0, 0.1, 0.1, 0.1, 0.0, 0.0],   # d
+                                   [0.0, 0.0, 0.1, 0.1, 0.1, 0.0],   # e
+                                   [0.0, 0.0, 0.0, 0.1, 0.1, 0.1],   # f
+                                   [0.2, 0.0, 0.0, 0.0, 0.1, 0.1],   # g
+                                   [0.2, 0.2, 0.0, 0.0, 0.0, 0.1],   # h
+                                   [0.3, 0.2, 0.2, 0.0, 0.0, 0.0],   # i
+                                   [0.0, 0.3, 0.2, 0.2, 0.0, 0.0],   # j
+                                   [0.0, 0.0, 0.3, 0.2, 0.2, 0.0],   # k
+                                   [0.0, 0.0, 0.0, 0.3, 0.2, 0.2],   # l
+                                   [0.0, 0.0, 0.0, 0.0, 0.3, 0.2],   # m
+                                   [0.0, 0.0, 0.0, 0.0, 0.0, 0.3],   # n
+                                   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],   # o
+                                   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])  # p
+        profile = kmeans.make_profile_partial(sequences, indices)
+        self.assertTrue(numpy.allclose(ref_profile, profile))
+
+    def test_compatibility_identity(self):
+        profile = numpy.array([[1.0, 0.0, 0.0, 0.0],   # a
+                               [0.0, 1.0, 0.0, 0.0],   # b
+                               [0.0, 0.0, 1.0, 0.0],   # c
+                               [0.0, 0.0, 0.0, 1.0],   # d
+                               [0.0, 0.0, 0.0, 0.0],   # e
+                               [0.0, 0.0, 0.0, 0.0],   # f
+                               [0.0, 0.0, 0.0, 0.0],   # g
+                               [0.0, 0.0, 0.0, 0.0],   # h
+                               [0.0, 0.0, 0.0, 0.0],   # i
+                               [0.0, 0.0, 0.0, 0.0],   # j
+                               [0.0, 0.0, 0.0, 0.0],   # k
+                               [0.0, 0.0, 0.0, 0.0],   # l
+                               [0.0, 0.0, 0.0, 0.0],   # m
+                               [0.0, 0.0, 0.0, 0.0],   # n
+                               [0.0, 0.0, 0.0, 0.0],   # o
+                               [0.0, 0.0, 0.0, 0.0]])  # p
+        sequence = 'abcd'
+        reference_compatibility = len(sequence)
+        compatibility = kmeans.compatibility(profile, sequence)
+        self.assertAlmostEqual(compatibility, reference_compatibility)
+
+    def test_compatibility(self):
+        # The profile corresponds to the following sequences:
+        # - abcd    - ioph    - alep
+        # - jiop    - fgad    - hobe
+        # - abjp    - piep
+        # - hbno    - jojo
+        profile = numpy.array([[0.3, 0.0, 0.1, 0.0],   # a
+                               [0.0, 0.3, 0.1, 0.0],   # b
+                               [0.0, 0.0, 0.1, 0.0],   # c
+                               [0.0, 0.0, 0.0, 0.2],   # d
+                               [0.0, 0.1, 0.2, 0.1],   # e
+                               [0.1, 0.0, 0.0, 0.0],   # f
+                               [0.0, 0.1, 0.0, 0.0],   # g
+                               [0.2, 0.0, 0.0, 0.1],   # h
+                               [0.1, 0.2, 0.0, 0.0],   # i
+                               [0.2, 0.0, 0.2, 0.0],   # j
+                               [0.0, 0.0, 0.0, 0.0],   # k
+                               [0.0, 0.1, 0.0, 0.0],   # l
+                               [0.0, 0.0, 0.0, 0.0],   # m
+                               [0.0, 0.0, 0.1, 0.0],   # n
+                               [0.0, 0.2, 0.1, 0.2],   # o
+                               [0.1, 0.0, 0.1, 0.4]])  # p
+        # Sanity check: each column of the profile must sum to 1
+        self.assertTrue(numpy.allclose(profile.sum(axis=0),
+                                       numpy.ones((profile.shape[1],),
+                                                  dtype=profile.dtype)))
+        reference_sequences = (('abcd', 0.9), ('hipo', 0.7), ('bcda', 0.0),
+                               ('aaaa', 0.4), ('kkkk', 0.0), ('oooo', 0.5),)
+        for sequence, reference_compatibility in reference_sequences:
+            compatibility = kmeans.compatibility(profile, sequence)
+            self.assertAlmostEqual(compatibility, reference_compatibility)
+
+    def test_argmax(self):
+        reference = (([0, 1, 2, 3, 4], 4),  # Ordered
+                     ([4, 3, 2, 1, 0], 0),  # Reverse ordered
+                     ([1, 3, 2, 4, 0], 3),  # Random order
+                     ([0, 0, 3, 4, 4], 3),  # Duplicates
+                     ([4, 4, 4, 4, 4], 0),  # All the same
+                     )
+        for test_case, expectation in reference:
+            self.assertEqual(kmeans._argmax(test_case), expectation)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/setup.py b/setup.py
index e30c176..0b338d8 100644
--- a/setup.py
+++ b/setup.py
@@ -73,6 +73,7 @@
             'PBclust = pbxplore.scripts.PBclust:pbclust_cli',
             'PBcount = pbxplore.scripts.PBcount:pbcount_cli',
             'PBstat = pbxplore.scripts.PBstat:pbstat_cli',
+            'PBkmeans = pbxplore.scripts.PBkmeans:pbkmeans_cli',
             'PBdata = pbxplore.scripts.PBdata:pbdata_cli',
         ],
     },
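
For reference, a minimal usage sketch of the `pbxplore.analysis.kmeans` module introduced by this patch; it assumes the patch is applied as-is, the fasta file name is only a placeholder, and error handling is omitted::

    import sys

    import pbxplore as pbx
    from pbxplore.analysis import kmeans

    # Read PB sequences from a fasta file produced beforehand
    # ('sequences.PB.fasta' is a placeholder name).
    headers, sequences = pbx.io.read_several_fasta(['sequences.PB.fasta'])

    # Cluster the sequences into 4 groups with at most 100 iterations.
    # k_means returns one cluster identifier per sequence, the frequency
    # profile of each cluster, and whether the clustering converged.
    groups, centers, converged = kmeans.k_means(sequences, ngroups=4,
                                                max_iter=100,
                                                logfile=sys.stderr)

    # One cluster identifier per input sequence, in the same order as headers.
    for header, group in zip(headers, groups):
        print('{0}\t{1}'.format(header, group))

The same clustering is exposed on the command line through the new `PBkmeans` entry point, e.g. `PBkmeans -f sequences.PB.fasta -o results --clusters 4` (again, the fasta file name is a placeholder).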