diff --git a/CHANGELOG b/CHANGELOG index 90bde99f..405b5bc7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,9 @@ +v0.10.2 (2019-01-30) + +- new option to import a "states" file format, a bed file with categorical data, e.g. from chromHMM to multivec format. +- while converting a bed file to multivec, each segment can now be a multiple of base_resolution, +rather than exactly match the base_resolution + v0.10.1 (2019-01-22) - Removed a buggy print statement from the conversion script diff --git a/clodius/cli/convert.py b/clodius/cli/convert.py index 4bc71f53..d038bf03 100644 --- a/clodius/cli/convert.py +++ b/clodius/cli/convert.py @@ -17,13 +17,13 @@ def epilogos_bedline_to_vector(bedline): ''' Convert a line from an epilogos bedfile to vector format. - + Parameters ----------- parts: [string,....] A line from a bedfile broken up into its constituent parts (e.g. ["chr1", "1000", "2000", "[1,2,34,5]"]) - + Returns ------- An array containing the values associated with that line @@ -39,9 +39,40 @@ def epilogos_bedline_to_vector(bedline): chrom=parts[0] start=int(parts[1]) end=int(parts[2]) - + return (chrom, start, end, states) +def states_bedline_to_vector(bedline,states_dic): + ''' + Convert a line from a bedfile containing states in categorical data to vector format. + + Parameters + ---------- + + parts: [string,...] + A line form a bedfile broken up into its contituent parts + (e.g. ["chr1", "1000", "2000", "state"])) + + + states_dic: + A dictionary containing the states in the file with a corresponding value + + Returns + ------- + + An array containing values associated the state + + ''' + parts = bedline.decode('utf8').strip().split('\t') + chrom=parts[0] + start=int(parts[1]) + end=int(parts[2]) + state= states_dic[parts[3].encode('utf8')] + + states_vector = [ 1 if index == state else 0 for index in range(len(states_dic))] + + return (chrom, start, end, states_vector) + @cli.group() def convert(): ''' @@ -82,6 +113,7 @@ def _bedgraph_to_multivec( if row_infos_filename is not None: with open(row_infos_filename, 'r') as fr: row_infos = [l.strip().encode('utf8') for l in fr] + else: row_infos = None @@ -101,10 +133,16 @@ def bedline_to_chrom_start_end_vector(bedline): return (chrom, start, end, vector) if format == 'epilogos': - cmv.bedfile_to_multivec(filepath, f_out, epilogos_bedline_to_vector, + cmv.bedfile_to_multivec(filepath, f_out, epilogos_bedline_to_vector, starting_resolution, has_header, chunk_size); + elif format == 'states': + assert(row_infos != None), "A row_infos file must be provided for --format = 'states' " + states_dic = {row_infos[x]:x for x in range(len(row_infos))} + + cmv.bedfile_to_multivec(filepath, f_out, states_bedline_to_vector, + starting_resolution, has_header, chunk_size, states_dic); else: - cmv.bedfile_to_multivec(filepath, f_out, bedline_to_chrom_start_end_vector, + cmv.bedfile_to_multivec(filepath, f_out, bedline_to_chrom_start_end_vector, starting_resolution, has_header, chunk_size); f_out.close() @@ -124,7 +162,7 @@ def agg(x): # newshape = (x.shape[2], -1, 2) # b = x.T.reshape((-1,)) - + a = x.T.reshape((x.shape[1],-1,2)) # this is going to be an odd way to get rid of nan @@ -153,7 +191,7 @@ def agg(x): else: agg=lambda x: x.T.reshape((x.shape[1],-1,2)).sum(axis=2).T - cmv.create_multivec_multires(f_in, + cmv.create_multivec_multires(f_in, chromsizes = zip(chrom_names, chrom_sizes), agg=agg, starting_resolution=starting_resolution, @@ -242,7 +280,10 @@ def agg(x): ) @click.option( '--format', - type=click.Choice(['default', 'epilogos']), + type=click.Choice(['default', 'epilogos', 'states']), + help= "'default':chr start end state1_value state2_value, etc;" + "'epilogos': chr start end [[state1_value, state1_num],[state2_value, state2_num],[etc]];" + "'states': chr start end state_name", default='default' ) @click.option( @@ -262,15 +303,14 @@ def agg(x): type=click.Choice(['sum', 'logsumexp']), default='sum' ) -def bedfile_to_multivec(filepath, output_file, assembly, chromosome_col, - from_pos_col, to_pos_col, value_col, has_header, +def bedfile_to_multivec(filepath, output_file, assembly, chromosome_col, + from_pos_col, to_pos_col, value_col, has_header, chunk_size, nan_value, chromsizes_filename, - starting_resolution, num_rows, + starting_resolution, num_rows, format, row_infos_filename, tile_size, method): - _bedgraph_to_multivec(filepath, output_file, assembly, chromosome_col, - from_pos_col, to_pos_col, value_col, has_header, - chunk_size, nan_value, - chromsizes_filename, starting_resolution, num_rows, + _bedgraph_to_multivec(filepath, output_file, assembly, chromosome_col, + from_pos_col, to_pos_col, value_col, has_header, + chunk_size, nan_value, + chromsizes_filename, starting_resolution, num_rows, format, row_infos_filename, tile_size, method) - diff --git a/clodius/multivec.py b/clodius/multivec.py index 0eb7c96c..145adb71 100644 --- a/clodius/multivec.py +++ b/clodius/multivec.py @@ -8,9 +8,9 @@ import os.path as op import sys -def bedfile_to_multivec(input_filename, f_out, +def bedfile_to_multivec(input_filename, f_out, bedline_to_chrom_start_end_vector, base_resolution, - has_header, chunk_size): + has_header, chunk_size, row_infos): ''' Convert an epilogos bedfile to multivec format. ''' @@ -18,7 +18,7 @@ def bedfile_to_multivec(input_filename, f_out, f = gzip.open(input_filename, 'r') else: f = open(input_filename, 'r') - + FILL_VALUE = np.nan # batch regions because h5py is really bad at writing @@ -29,7 +29,7 @@ def bedfile_to_multivec(input_filename, f_out, curr_index = 0 # the start of the batch in the dataset batch_start_index = 0 - + if has_header: f.readline() @@ -38,11 +38,11 @@ def bedfile_to_multivec(input_filename, f_out, warned = False for line in f: - chrom,start,end,vector = bedline_to_chrom_start_end_vector(line) + chrom,start,end,vector = bedline_to_chrom_start_end_vector(line, row_infos) - if end - start != base_resolution and not warned: - print("WARNING: interval length ({}) doesn't match base resolution ({}): {}". - format(end - start, base_resolution, line)) + if end % base_resolution != 0 or start % base_resolution != 0 and not warned: + print("WARNING: either the start or end coordinate is not a multiple of the base resolution ({}): {}". + format(base_resolution, line)) warned = True if prev_chrom is not None and chrom != prev_chrom: @@ -50,7 +50,7 @@ def bedfile_to_multivec(input_filename, f_out, # the previous values print("len(batch:", len(batch)) f_out[prev_chrom][batch_start_index:batch_start_index+len(batch)] = np.array(batch) - + # we're starting a new chromosome so we start from the beginning curr_index = 0 batch_start_index = 0 @@ -77,8 +77,15 @@ def bedfile_to_multivec(input_filename, f_out, assert(curr_index == data_start_index) #print('vector', vector) - batch += [vector] - curr_index += 1 + + #When the binsize is not equal to the base_resolution + # "break down" the binsize into bins of the rbase_esolution size + #and add the values to each bin. + + data_end_index = end // base_resolution + while curr_index < data_end_index: + batch += [vector] + curr_index += 1 # fill in empty @@ -98,14 +105,14 @@ def bedfile_to_multivec(input_filename, f_out, #print('chrom', chrom) f_out[chrom][batch_start_index:batch_start_index+len(batch)] = np.array(batch) -def create_multivec_multires(array_data, chromsizes, +def create_multivec_multires(array_data, chromsizes, agg, starting_resolution=1, tile_size=1024, output_file='/tmp/my_file.multires', row_infos=None): ''' Create a multires file containing the array data aggregated at multiple resolutions. - + Parameters ---------- array_data: {'chrom_key': np.array, } @@ -131,11 +138,11 @@ def create_multivec_multires(array_data, chromsizes, # this will be the file that contains our multires data f = h5py.File(filename, 'w') - + # store some metadata f.create_group('info') f['info'].attrs['tile-size'] = tile_size - + f.create_group('resolutions') f.create_group('chroms') @@ -184,7 +191,7 @@ def create_multivec_multires(array_data, chromsizes, while start < len(chrom_data): chrom_data[start:start + chunk_size] = array_data[chrom][start:start+chunk_size] # see above section start += int(min(standard_chunk_size, len(array_data[chrom]) - start)) - + # the maximum zoom level corresponds to the number of aggregations # that need to be performed so that the entire extent of @@ -192,7 +199,7 @@ def create_multivec_multires(array_data, chromsizes, total_length = sum(lengths) # print("total_length:", total_length, "tile_size:", tile_size, "starting_resolution:", starting_resolution) max_zoom = math.ceil(math.log(total_length / (tile_size * starting_resolution) ) / math.log(2)) - + # we're going to go through and create the data for the different # zoom levels by summing adjacent data points prev_resolution = curr_resolution @@ -230,7 +237,7 @@ def create_multivec_multires(array_data, chromsizes, new_shape[0] = math.ceil(new_shape[0] / 2) new_shape = tuple(new_shape) - f['resolutions'][str(curr_resolution)]['values'].create_dataset(chrom, + f['resolutions'][str(curr_resolution)]['values'].create_dataset(chrom, new_shape, compression='gzip') while start < len(chrom_data): @@ -239,7 +246,7 @@ def create_multivec_multires(array_data, chromsizes, #print("prev_resolution:", prev_resolution) #print("old_data.shape", old_data.shape) - # this is a sort of roundabout way of calculating the + # this is a sort of roundabout way of calculating the # shape of the aggregated array, but all its doing is # just halving the first dimension of the previous shape # without taking into account the other dimensions @@ -258,8 +265,8 @@ def create_multivec_multires(array_data, chromsizes, new_data = agg(old_data) ''' - print("zoom_level:", max_zoom - 1 - i, - "resolution:", curr_resolution, + print("zoom_level:", max_zoom - 1 - i, + "resolution:", curr_resolution, "new_data length", len(new_data)) ''' f['resolutions'][str(curr_resolution)]['values'][chrom][int(start/2):int(start/2+chunk_size/2)] = new_data diff --git a/test/sample_data/states_format_input_testfile.bed.gz b/test/sample_data/states_format_input_testfile.bed.gz new file mode 100644 index 00000000..2a49aab1 Binary files /dev/null and b/test/sample_data/states_format_input_testfile.bed.gz differ diff --git a/test/sample_data/states_format_output_test_file.multires.mv5 b/test/sample_data/states_format_output_test_file.multires.mv5 new file mode 100644 index 00000000..6792393f Binary files /dev/null and b/test/sample_data/states_format_output_test_file.multires.mv5 differ diff --git a/test/sample_data/states_format_test_row_infos.txt b/test/sample_data/states_format_test_row_infos.txt new file mode 100644 index 00000000..d2dcfc03 --- /dev/null +++ b/test/sample_data/states_format_test_row_infos.txt @@ -0,0 +1,10 @@ +Quies +FaireW +Low +Pol2 +Gen3' +Elon +Ctcf +EnhW +EnhWF +ElonW \ No newline at end of file