Skip to content

Commit

Permalink
a couple more stats
Browse files Browse the repository at this point in the history
  • Loading branch information
gfudenberg committed Oct 8, 2018
1 parent 247483d commit d9dedda
Showing 1 changed file with 64 additions and 5 deletions.
69 changes: 64 additions & 5 deletions looplib/loopstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,24 @@
from inspect import getfullargspec
import pandas as pd

import bioframe
from bioframe.tools import tsv, bedtools
def bedtools_intersect_counts(left, right, **kwargs):
with tsv(left) as a, tsv(right) as b:
out = bedtools.intersect(a=a.name, b=b.name,c=True)
out.columns = list(left.columns) + ['counts']
return out

#### utilities ####

def load_pickle(filename):
return pickle.load(open( filename,'rb' ) )


def load_joblib_data(filename):
return joblib.load(filename.replace('SMC','block'))['data']


stat_output_dict = {} # create a dictionary of statistic outputs, to pass to the report

#### boundary_list independent statistics #####
Expand Down Expand Up @@ -35,7 +48,6 @@ def calc_loop_size(LEF_array,polymer_length):
def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list):
''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
returns the fraction with both legs, one leg, or neither leg overlapping boundary_list '''
if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs')
boundary_list = np.array(boundary_list,dtype=int).flatten()
isBoundary = np.histogram(boundary_list, np.arange(0,polymer_length+1))[0]
LEF_arm_status = np.sum( isBoundary[LEF_array] , 1)
Expand All @@ -45,7 +57,6 @@ def calc_LEF_stalling_by_leg(LEF_array, polymer_length, boundary_list):
def calc_boundary_occupancy(LEF_array, polymer_length, boundary_list):
''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
returns the fraction with both legs, one leg, or neither leg overlapping boundary_list '''
if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs')
boundary_list = np.array(boundary_list,dtype=int).flatten()
bb = np.arange(0,polymer_length+1, 1); bb_mids = .5*(bb[:-1]+bb[1:])
extruderHistogram, b = np.histogram( LEF_array ,bb)
Expand All @@ -56,32 +67,80 @@ def calc_boundary_occupancy(LEF_array, polymer_length, boundary_list):
stat_output_dict[ calc_boundary_occupancy] = ['boundary_occupancy','non_occupancy']


def calc_boundary_crossing_percent(LEF_array, polymer_length, boundary_list):
''' calculates the fraction of LEFs with two, one, or no legs stalled at boundaries.
returns the fraction with both legs, one leg, or neither leg overlapping boundary_list '''
boundary_list = np.array(boundary_list,dtype=int).flatten()
allLocs_df = pd.DataFrame(boundary_list, columns=['start'])
allLocs_df['end'] = boundary_list
allLocs_df['chr'] = 13
allLocs_df = allLocs_df[['chr','start','end']]

LEF_array_df = pd.DataFrame(LEF_array,columns=['start','end'])
LEF_array_df['chr'] = 13
LEF_array_df= LEF_array_df[['chr','start','end']]
LEF_array_df.sort_values(['start'], inplace=True)
LEF_array_shrink_df = LEF_array_df.copy()
LEF_array_shrink_df['start'] = LEF_array_shrink_df['start'].values+1
LEF_array_shrink_df['end'] = np.maximum( LEF_array_shrink_df['end'].values-1, LEF_array_shrink_df['start'])

percent_crossing = np.sum((bedtools_intersect_counts( LEF_array_shrink_df, allLocs_df)['counts'] > 0)
/ len(LEF_array) )
return [percent_crossing]
stat_output_dict[ calc_boundary_crossing_percent] = ['percent_crossing']

def calc_numLoops_given_boundaryPair_contact(LEF_array, polymer_length, boundary_pair_array, data):
LEF_array_df = pd.DataFrame(LEF_array,columns=['start','end'])
LEF_array_df['chr'] = 13
LEF_array_df= LEF_array_df[['chr','start','end']]

sep2_dists = np.sum( (data[boundary_pair_array[:,0],:]
- data[boundary_pair_array[:,1],:])**2.,axis=1 )**.5
sep2_contacts = boundary_pair_array[ sep2_dists < 3 ,:]

sep2_contacts_df = pd.DataFrame(sep2_contacts,columns=['start','end'])
sep2_contacts_df['chr'] = 13
sep2_contacts_df= sep2_contacts_df[['chr','start','end']]
sep2_contacts_df.sort_values(['start'], inplace=True)

cbins = np.arange(0,len(LEF_array),1)
a,b = np.histogram( bedtools_intersect_counts(sep2_contacts_df, LEF_array_df)['counts'].values , cbins)
a = a/np.sum(a)
no_loops, single_loop, multiple_loops = ( a[0], a[1], np.sum(a[2:]) )
return [no_loops, single_loop, multiple_loops]
stat_output_dict[calc_numLoops_given_boundaryPair_contact]= ['no_loops','one_loop','two_plus_loops']


##### averaging and summarizing ####

def flatten(l): return flatten(l[0]) + (flatten(l[1:]) if len(l) > 1 else []) if type(l) is list else [l] # since list(itertools.chain.from_iterable(newlist)) doesn't quite cut it...

def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_function=load_pickle, **kwargs):
def _calc_stats(filelist, stat_function_list=[calc_coverage_by_LEFs], load_function=load_pickle, load_function_data=None, **kwargs):
''' takes a list of filenames returns loop statistics over these files. can be parallelized in the future...'''
stat_list = []
for f in filelist:
try:
LEF_array = np.array(load_function(f),dtype=int)
if LEF_array.shape[1] !=2: raise Exception('needs to be a numLEFs x 2 array of LEFs')
if load_function_data != None:
data = np.array(load_function_data(f))
filestats = []
for stat_function in stat_function_list:
numInputs = len( getfullargspec(stat_function)[0])
if numInputs == 2:
filestats.append( stat_function(LEF_array, kwargs['polymer_length']))
elif numInputs == 3:
filestats.append( stat_function(LEF_array, kwargs['polymer_length'], kwargs['boundary_list']))
elif numInputs == 4:
filestats.append( stat_function(LEF_array, kwargs['polymer_length'], kwargs['boundary_pair_array'], data ))
stat_list.append(flatten(filestats))
except:
print('bad file', f); continue
return stat_list


def _create_loopstats_report(filelist_tuples, stat_function_list=[calc_coverage_by_LEFs],
load_function=load_pickle,roundDecimals=2, **kwargs):
load_function=load_pickle, load_function_data=None,roundDecimals=2, **kwargs):
''' averages the stat functions over each group of filelists; kwargs needed depend on functions called
usage: _create_loopstats_report([(filelist1,'smclife1'), (filelist2,'smclife2') ],
stat_function_list=[calc_loop_size,calc_coverage_by_LEFs, calc_LEF_stalling_by_leg, calc_boundary_occupancy],
Expand All @@ -91,7 +150,7 @@ def _create_loopstats_report(filelist_tuples, stat_function_list=[calc_coverage_
loopstats_indicies =[]
for filelist, filename in filelist_tuples:
loopstats_indicies.append(filename)
loopstats_report.append( np.mean( _calc_stats(filelist, stat_function_list, load_function, **kwargs),axis = 0) )
loopstats_report.append( np.mean( _calc_stats(filelist, stat_function_list, load_function, load_function_data, **kwargs),axis = 0) )
loopstats_report = np.array(loopstats_report)
if roundDecimals != None: loopstats_report = np.round(loopstats_report,roundDecimals)

Expand Down

0 comments on commit d9dedda

Please sign in to comment.