diff --git a/setup.py b/setup.py index 6ec23cb..cea3b86 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ name=__pkg_name__, license="MIT", description='xpore is a python package for Nanopore data analysis.', - version='v0.5.5', + version='v0.5.6', long_description=README, long_description_content_type='text/markdown', url='https://github.com/GoekeLab/xpore', diff --git a/xpore/scripts/dataprep.py b/xpore/scripts/dataprep.py index 5547564..e54312e 100644 --- a/xpore/scripts/dataprep.py +++ b/xpore/scripts/dataprep.py @@ -41,49 +41,54 @@ def combine(read_name,eventalign_per_read,out_paths,locks): eventalign_result = pd.DataFrame.from_records(eventalign_per_read) cond_successfully_eventaligned = eventalign_result['reference_kmer'] == eventalign_result['model_kmer'] - eventalign_result = eventalign_result[cond_successfully_eventaligned] + + if cond_successfully_eventaligned.sum() != 0: - keys = ['read_index','contig','position','reference_kmer'] # for groupby - eventalign_result['length'] = pd.to_numeric(eventalign_result['end_idx'])-pd.to_numeric(eventalign_result['start_idx']) - eventalign_result['sum_norm_mean'] = pd.to_numeric(eventalign_result['event_level_mean']) * eventalign_result['length'] + eventalign_result = eventalign_result[cond_successfully_eventaligned] - eventalign_result = eventalign_result.groupby(keys) - sum_norm_mean = eventalign_result['sum_norm_mean'].sum() - start_idx = eventalign_result['start_idx'].min() - end_idx = eventalign_result['end_idx'].max() - total_length = eventalign_result['length'].sum() + keys = ['read_index','contig','position','reference_kmer'] # for groupby + eventalign_result['length'] = pd.to_numeric(eventalign_result['end_idx'])-pd.to_numeric(eventalign_result['start_idx']) + eventalign_result['sum_norm_mean'] = pd.to_numeric(eventalign_result['event_level_mean']) * eventalign_result['length'] - eventalign_result = pd.concat([start_idx,end_idx],axis=1) - eventalign_result['norm_mean'] = sum_norm_mean/total_length + eventalign_result = eventalign_result.groupby(keys) + sum_norm_mean = eventalign_result['sum_norm_mean'].sum() + start_idx = eventalign_result['start_idx'].min() + end_idx = eventalign_result['end_idx'].max() + total_length = eventalign_result['length'].sum() - eventalign_result.reset_index(inplace=True) + eventalign_result = pd.concat([start_idx,end_idx],axis=1) + eventalign_result['norm_mean'] = sum_norm_mean/total_length - # eventalign_result['transcript_id'] = [contig.split('.')[0] for contig in eventalign_result['contig']] - eventalign_result['transcript_id'] = eventalign_result['contig'] - eventalign_result['transcriptomic_position'] = pd.to_numeric(eventalign_result['position']) + 2 # the middle position of 5-mers. - # eventalign_result = misc.str_encode(eventalign_result) - eventalign_result['read_id'] = [read_name]*len(eventalign_result) + eventalign_result.reset_index(inplace=True) - # features = ['read_id','transcript_id','transcriptomic_position','reference_kmer','norm_mean','start_idx','end_idx'] - # features_dtype = np.dtype([('read_id', 'S36'), ('transcript_id', 'S15'), ('transcriptomic_position', '