From 05aac4f7fd48d6bf2f586ff46894be4aa483c0dc Mon Sep 17 00:00:00 2001
From: Vitaliy Mysak
Date: Tue, 9 Jul 2024 11:48:52 -0700
Subject: [PATCH] Improve reading of .SAM files

The data section does not always begin at line 3.
This improvement makes the code able to handle those situations
where it does not: the leading '@' header lines are counted and
exactly that many rows are skipped.
---
 gene_splicer/utils.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py
index 875fe95..effd718 100644
--- a/gene_splicer/utils.py
+++ b/gene_splicer/utils.py
@@ -334,7 +334,18 @@ def splice_aligned_genes(query, target, samfile, annotation):
 
 
 def load_samfile(samfile_path):
-    result = pd.read_table(samfile_path, skiprows=2, header=None)
+    # SAM header lines start with '@' and their number varies between
+    # files, so count them rather than assuming a fixed offset.  Reading
+    # lazily stops at the first alignment record instead of loading the
+    # whole file into memory.
+    header_line_count = 0
+    with open(samfile_path, 'r') as file:
+        for line in file:
+            if not line.startswith('@'):
+                break
+            header_line_count += 1
+
+    result = pd.read_table(samfile_path, skiprows=header_line_count, header=None)
     result['cigar'] = result.apply(split_cigar, axis=1)
     return result
 