Skip to content

Commit

Permalink
Merge branch 'develop' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
Burak Han Alver authored Jan 30, 2019
2 parents c076788 + f6980c1 commit c119237
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 37 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
v0.10.2 (2019-01-30)

- new option to import a "states" file format, a bed file with categorical data, e.g. from chromHMM to multivec format.
- while converting a bed file to multivec, each segment can now be a multiple of base_resolution,
rather than exactly match the base_resolution

v0.10.1 (2019-01-22)

- Removed a buggy print statement from the conversion script
Expand Down
72 changes: 56 additions & 16 deletions clodius/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@
def epilogos_bedline_to_vector(bedline):
'''
Convert a line from an epilogos bedfile to vector format.
Parameters
-----------
parts: [string,....]
A line from a bedfile broken up into its constituent parts
(e.g. ["chr1", "1000", "2000", "[1,2,34,5]"])
Returns
-------
An array containing the values associated with that line
Expand All @@ -39,9 +39,40 @@ def epilogos_bedline_to_vector(bedline):
chrom=parts[0]
start=int(parts[1])
end=int(parts[2])

return (chrom, start, end, states)

def states_bedline_to_vector(bedline,states_dic):
'''
Convert a line from a bedfile containing states in categorical data to vector format.
Parameters
----------
parts: [string,...]
A line form a bedfile broken up into its contituent parts
(e.g. ["chr1", "1000", "2000", "state"]))
states_dic:
A dictionary containing the states in the file with a corresponding value
Returns
-------
An array containing values associated the state
'''
parts = bedline.decode('utf8').strip().split('\t')
chrom=parts[0]
start=int(parts[1])
end=int(parts[2])
state= states_dic[parts[3].encode('utf8')]

states_vector = [ 1 if index == state else 0 for index in range(len(states_dic))]

return (chrom, start, end, states_vector)

@cli.group()
def convert():
'''
Expand Down Expand Up @@ -82,6 +113,7 @@ def _bedgraph_to_multivec(
if row_infos_filename is not None:
with open(row_infos_filename, 'r') as fr:
row_infos = [l.strip().encode('utf8') for l in fr]

else:
row_infos = None

Expand All @@ -101,10 +133,16 @@ def bedline_to_chrom_start_end_vector(bedline):
return (chrom, start, end, vector)

if format == 'epilogos':
cmv.bedfile_to_multivec(filepath, f_out, epilogos_bedline_to_vector,
cmv.bedfile_to_multivec(filepath, f_out, epilogos_bedline_to_vector,
starting_resolution, has_header, chunk_size);
elif format == 'states':
assert(row_infos != None), "A row_infos file must be provided for --format = 'states' "
states_dic = {row_infos[x]:x for x in range(len(row_infos))}

cmv.bedfile_to_multivec(filepath, f_out, states_bedline_to_vector,
starting_resolution, has_header, chunk_size, states_dic);
else:
cmv.bedfile_to_multivec(filepath, f_out, bedline_to_chrom_start_end_vector,
cmv.bedfile_to_multivec(filepath, f_out, bedline_to_chrom_start_end_vector,
starting_resolution, has_header, chunk_size);

f_out.close()
Expand All @@ -124,7 +162,7 @@ def agg(x):
# newshape = (x.shape[2], -1, 2)
# b = x.T.reshape((-1,))


a = x.T.reshape((x.shape[1],-1,2))

# this is going to be an odd way to get rid of nan
Expand Down Expand Up @@ -153,7 +191,7 @@ def agg(x):
else:
agg=lambda x: x.T.reshape((x.shape[1],-1,2)).sum(axis=2).T

cmv.create_multivec_multires(f_in,
cmv.create_multivec_multires(f_in,
chromsizes = zip(chrom_names, chrom_sizes),
agg=agg,
starting_resolution=starting_resolution,
Expand Down Expand Up @@ -242,7 +280,10 @@ def agg(x):
)
@click.option(
'--format',
type=click.Choice(['default', 'epilogos']),
type=click.Choice(['default', 'epilogos', 'states']),
help= "'default':chr start end state1_value state2_value, etc;"
"'epilogos': chr start end [[state1_value, state1_num],[state2_value, state2_num],[etc]];"
"'states': chr start end state_name",
default='default'
)
@click.option(
Expand All @@ -262,15 +303,14 @@ def agg(x):
type=click.Choice(['sum', 'logsumexp']),
default='sum'
)
def bedfile_to_multivec(filepath, output_file, assembly, chromosome_col,
from_pos_col, to_pos_col, value_col, has_header,
def bedfile_to_multivec(filepath, output_file, assembly, chromosome_col,
from_pos_col, to_pos_col, value_col, has_header,
chunk_size, nan_value,
chromsizes_filename,
starting_resolution, num_rows,
starting_resolution, num_rows,
format, row_infos_filename, tile_size, method):
_bedgraph_to_multivec(filepath, output_file, assembly, chromosome_col,
from_pos_col, to_pos_col, value_col, has_header,
chunk_size, nan_value,
chromsizes_filename, starting_resolution, num_rows,
_bedgraph_to_multivec(filepath, output_file, assembly, chromosome_col,
from_pos_col, to_pos_col, value_col, has_header,
chunk_size, nan_value,
chromsizes_filename, starting_resolution, num_rows,
format, row_infos_filename, tile_size, method)

49 changes: 28 additions & 21 deletions clodius/multivec.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@
import os.path as op
import sys

def bedfile_to_multivec(input_filename, f_out,
def bedfile_to_multivec(input_filename, f_out,
bedline_to_chrom_start_end_vector, base_resolution,
has_header, chunk_size):
has_header, chunk_size, row_infos):
'''
Convert an epilogos bedfile to multivec format.
'''
if op.splitext(input_filename)[1] == '.gz':
f = gzip.open(input_filename, 'r')
else:
f = open(input_filename, 'r')

FILL_VALUE = np.nan

# batch regions because h5py is really bad at writing
Expand All @@ -29,7 +29,7 @@ def bedfile_to_multivec(input_filename, f_out,
curr_index = 0
# the start of the batch in the dataset
batch_start_index = 0

if has_header:
f.readline()

Expand All @@ -38,19 +38,19 @@ def bedfile_to_multivec(input_filename, f_out,
warned = False

for line in f:
chrom,start,end,vector = bedline_to_chrom_start_end_vector(line)
chrom,start,end,vector = bedline_to_chrom_start_end_vector(line, row_infos)

if end - start != base_resolution and not warned:
print("WARNING: interval length ({}) doesn't match base resolution ({}): {}".
format(end - start, base_resolution, line))
if end % base_resolution != 0 or start % base_resolution != 0 and not warned:
print("WARNING: either the start or end coordinate is not a multiple of the base resolution ({}): {}".
format(base_resolution, line))
warned = True

if prev_chrom is not None and chrom != prev_chrom:
# we've reached a new chromosome so we'll dump all
# the previous values
print("len(batch:", len(batch))
f_out[prev_chrom][batch_start_index:batch_start_index+len(batch)] = np.array(batch)

# we're starting a new chromosome so we start from the beginning
curr_index = 0
batch_start_index = 0
Expand All @@ -77,8 +77,15 @@ def bedfile_to_multivec(input_filename, f_out,

assert(curr_index == data_start_index)
#print('vector', vector)
batch += [vector]
curr_index += 1

#When the binsize is not equal to the base_resolution
# "break down" the binsize into bins of the rbase_esolution size
#and add the values to each bin.

data_end_index = end // base_resolution
while curr_index < data_end_index:
batch += [vector]
curr_index += 1

# fill in empty

Expand All @@ -98,14 +105,14 @@ def bedfile_to_multivec(input_filename, f_out,
#print('chrom', chrom)
f_out[chrom][batch_start_index:batch_start_index+len(batch)] = np.array(batch)

def create_multivec_multires(array_data, chromsizes,
def create_multivec_multires(array_data, chromsizes,
agg, starting_resolution=1,
tile_size=1024, output_file='/tmp/my_file.multires',
row_infos=None):
'''
Create a multires file containing the array data
aggregated at multiple resolutions.
Parameters
----------
array_data: {'chrom_key': np.array, }
Expand All @@ -131,11 +138,11 @@ def create_multivec_multires(array_data, chromsizes,

# this will be the file that contains our multires data
f = h5py.File(filename, 'w')

# store some metadata
f.create_group('info')
f['info'].attrs['tile-size'] = tile_size

f.create_group('resolutions')
f.create_group('chroms')

Expand Down Expand Up @@ -184,15 +191,15 @@ def create_multivec_multires(array_data, chromsizes,
while start < len(chrom_data):
chrom_data[start:start + chunk_size] = array_data[chrom][start:start+chunk_size] # see above section
start += int(min(standard_chunk_size, len(array_data[chrom]) - start))


# the maximum zoom level corresponds to the number of aggregations
# that need to be performed so that the entire extent of
# the dataset fits into one tile
total_length = sum(lengths)
# print("total_length:", total_length, "tile_size:", tile_size, "starting_resolution:", starting_resolution)
max_zoom = math.ceil(math.log(total_length / (tile_size * starting_resolution) ) / math.log(2))

# we're going to go through and create the data for the different
# zoom levels by summing adjacent data points
prev_resolution = curr_resolution
Expand Down Expand Up @@ -230,7 +237,7 @@ def create_multivec_multires(array_data, chromsizes,
new_shape[0] = math.ceil(new_shape[0] / 2)
new_shape = tuple(new_shape)

f['resolutions'][str(curr_resolution)]['values'].create_dataset(chrom,
f['resolutions'][str(curr_resolution)]['values'].create_dataset(chrom,
new_shape, compression='gzip')

while start < len(chrom_data):
Expand All @@ -239,7 +246,7 @@ def create_multivec_multires(array_data, chromsizes,
#print("prev_resolution:", prev_resolution)
#print("old_data.shape", old_data.shape)

# this is a sort of roundabout way of calculating the
# this is a sort of roundabout way of calculating the
# shape of the aggregated array, but all its doing is
# just halving the first dimension of the previous shape
# without taking into account the other dimensions
Expand All @@ -258,8 +265,8 @@ def create_multivec_multires(array_data, chromsizes,
new_data = agg(old_data)

'''
print("zoom_level:", max_zoom - 1 - i,
"resolution:", curr_resolution,
print("zoom_level:", max_zoom - 1 - i,
"resolution:", curr_resolution,
"new_data length", len(new_data))
'''
f['resolutions'][str(curr_resolution)]['values'][chrom][int(start/2):int(start/2+chunk_size/2)] = new_data
Expand Down
Binary file added test/sample_data/states_format_input_testfile.bed.gz
Binary file not shown.
Binary file not shown.
10 changes: 10 additions & 0 deletions test/sample_data/states_format_test_row_infos.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Quies
FaireW
Low
Pol2
Gen3'
Elon
Ctcf
EnhW
EnhWF
ElonW

0 comments on commit c119237

Please sign in to comment.