From 6098677bdf7b7d90d6a41b2c02b6c811781db5d2 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 29 Jul 2024 17:50:15 -0700 Subject: [PATCH] Fix reading of SAM files Instead of using pandas, use the builtin csv module. Pandas will not parse a SAM file correctly if it does not have regular number of columns, which some SAM files dont. --- gene_splicer/utils.py | 42 +++++++++++++++++++--------------- tests/test_utils/test_utils.py | 5 ++-- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index e2c49f9..4985ba5 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -101,9 +101,9 @@ def csv_to_bed(csvfile, target_name='HXB2', offset_start=0, offset_stop=0): }) -def split_cigar(row): +def split_cigar(string): pattern = re.compile(r'(\d+)([A-Z])') - cigar = re.findall(pattern, row[5]) + cigar = re.findall(pattern, string) return cigar @@ -214,11 +214,12 @@ def modify_annot(annot): def splice_genes(query, target, samfile, annotation): results = {} - for i, row in samfile.iterrows(): + for i, row in enumerate(samfile): # Subtract 1 to convert target position to zero-base target_pos = int(row[3]) - 1 query_pos = None - for size, op in row['cigar']: + cigar = row[5] + for size, op in split_cigar(cigar): size = int(size) # logger.debug(f'size: {size}, op: {op}') # logger.debug(f'target_pos: {target_pos}, query_pos: {query_pos}') @@ -277,11 +278,12 @@ def coords_to_genes(coords, query): def splice_aligned_genes(query, target, samfile, annotation): results = {} sequences = {} - for i, row in samfile.iterrows(): + for i, row in enumerate(samfile): # Subtract 1 to convert target position to zero-base target_pos = int(row[3]) - 1 query_pos = None - for size, op in row['cigar']: + cigar = row[5] + for size, op in split_cigar(cigar): # print(f'size: {size}, op: {op}') # print(f'target_pos: {target_pos}, query_pos: {query_pos}') size = int(size) @@ -334,19 +336,17 @@ def splice_aligned_genes(query, target, samfile, annotation): return results, sequences -def load_samfile(samfile_path): - # Open the SAM file and find the starting point for data +def load_samfile(samfile_path: Path) -> List[List[str]]: with open(samfile_path, 'r') as file: - # Skip meta fields - lines = file.readlines() - data_start_index = 0 - for i, line in enumerate(lines): - if not line.startswith('@'): - data_start_index = i - break - - result = pd.read_table(samfile_path, skiprows=data_start_index, header=None) - result['cigar'] = result.apply(split_cigar, axis=1) + reader = csv.reader(file, delimiter='\t') + + result = [] + for row in reader: + # Skip header lines (lines starting with '@') + if row[0].startswith('@'): + continue + result.append(row) + return result @@ -594,12 +594,16 @@ def generate_table_precursor_2(hivseqinr_resultsfile, filtered_file, def get_softclipped_region(query, alignment, alignment_path): try: - size, op = alignment.iloc[0]['cigar'][0] + first_match = alignment[0] except IndexError: logger.warning('No alignment in %s!', alignment_path) return + + cigar = first_match[5] + size, op = split_cigar(cigar)[0] if op != 'S': return + size = int(size) return query[:size] diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index d378d2c..28d0adc 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -16,7 +16,8 @@ def test_get_softclip_start(): # Normally this alignment would be generated by a separate function aln_path = example / 'alignment.sam' aln = utils.load_samfile(aln_path) - size, op = aln.iloc[0]['cigar'][0] + cigar = aln[0][5] + size, op = utils.split_cigar(cigar)[0] size = int(size) query_fasta = Fasta(example / 'query.fasta') query_sequence = None @@ -110,4 +111,4 @@ def test_getSamplesFromCascade(): samples = utils.get_samples_from_cascade(cascade) assert len(samples) == 10 for i in range(10): - assert samples[str(i)] == i \ No newline at end of file + assert samples[str(i)] == i