Skip to content

Commit

Permalink
Fix reading of SAM files
Browse files Browse the repository at this point in the history
Instead of using pandas, use the builtin csv module.

Pandas will not parse a SAM file correctly if it does not have regular
number of columns, which some SAM files dont.
  • Loading branch information
Donaim committed Jul 30, 2024
1 parent 28950eb commit 29c17d3
Showing 1 changed file with 23 additions and 19 deletions.
42 changes: 23 additions & 19 deletions gene_splicer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ def csv_to_bed(csvfile, target_name='HXB2', offset_start=0, offset_stop=0):
})


def split_cigar(row):
def split_cigar(string):
pattern = re.compile(r'(\d+)([A-Z])')
cigar = re.findall(pattern, row[5])
cigar = re.findall(pattern, string)
return cigar


Expand Down Expand Up @@ -214,11 +214,12 @@ def modify_annot(annot):

def splice_genes(query, target, samfile, annotation):
results = {}
for i, row in samfile.iterrows():
for i, row in enumerate(samfile):
# Subtract 1 to convert target position to zero-base
target_pos = int(row[3]) - 1
query_pos = None
for size, op in row['cigar']:
cigar = row[5]
for size, op in split_cigar(cigar):
size = int(size)
# logger.debug(f'size: {size}, op: {op}')
# logger.debug(f'target_pos: {target_pos}, query_pos: {query_pos}')
Expand Down Expand Up @@ -277,11 +278,12 @@ def coords_to_genes(coords, query):
def splice_aligned_genes(query, target, samfile, annotation):
results = {}
sequences = {}
for i, row in samfile.iterrows():
for i, row in enumerate(samfile):
# Subtract 1 to convert target position to zero-base
target_pos = int(row[3]) - 1
query_pos = None
for size, op in row['cigar']:
cigar = row[5]
for size, op in split_cigar(cigar):
# print(f'size: {size}, op: {op}')
# print(f'target_pos: {target_pos}, query_pos: {query_pos}')
size = int(size)
Expand Down Expand Up @@ -334,19 +336,17 @@ def splice_aligned_genes(query, target, samfile, annotation):
return results, sequences


def load_samfile(samfile_path):
# Open the SAM file and find the starting point for data
def load_samfile(samfile_path: Path) -> List[List[str]]:
with open(samfile_path, 'r') as file:
# Skip meta fields
lines = file.readlines()
data_start_index = 0
for i, line in enumerate(lines):
if not line.startswith('@'):
data_start_index = i
break

result = pd.read_table(samfile_path, skiprows=data_start_index, header=None)
result['cigar'] = result.apply(split_cigar, axis=1)
reader = csv.reader(file, delimiter='\t')

result = []
for row in reader:
# Skip header lines (lines starting with '@')
if row[0].startswith('@'):
continue
result.append(row)

return result


Expand Down Expand Up @@ -594,12 +594,16 @@ def generate_table_precursor_2(hivseqinr_resultsfile, filtered_file,

def get_softclipped_region(query, alignment, alignment_path):
try:
size, op = alignment.iloc[0]['cigar'][0]
first_match = alignment[0]
except IndexError:
logger.warning('No alignment in %s!', alignment_path)
return

cigar = first_match[5]
size, op = split_cigar(cigar)[0]
if op != 'S':
return

size = int(size)
return query[:size]

Expand Down

0 comments on commit 29c17d3

Please sign in to comment.