From 77e91e7831c4f637f7f7c62350675f931373a998 Mon Sep 17 00:00:00 2001 From: Wolfgang Maier Date: Wed, 1 Jul 2020 17:34:14 +0200 Subject: [PATCH] Print error messages on stderr consistently --- kcalign/__init__.py | 73 ++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/kcalign/__init__.py b/kcalign/__init__.py index 800fc04..8422962 100644 --- a/kcalign/__init__.py +++ b/kcalign/__init__.py @@ -1,11 +1,8 @@ -#!/usr/bin/env python - from Bio.Seq import Seq from Bio import SeqIO from Bio.SeqRecord import SeqRecord import os import pty -import shlex import sys import subprocess import warnings @@ -46,14 +43,14 @@ def invoke_kalign(input_file, output_file): just to be safe. """ if not os.path.exists(input_file): - print('Input file missing') + print('Input file missing', file=sys.stderr) return 1 # If STDIN is not a tty, kalign3 assumes that STDIN is an input file # and fails to detect its type. # Pass in a fake pty to work around this behavior, which is appropriate # for command-line usage but doesn't work in a headless environment. 
fakepty, alsofakepty = pty.openpty() - command = shlex.split(f'kalign -i {input_file} -o {output_file}') + command = ['kalign', '-i', input_file, '-o', output_file] kaligner = subprocess.Popen(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE, stdin=fakepty) kaligner.wait() @@ -63,13 +60,13 @@ def invoke_kalign(input_file, output_file): # If Kalign fails try again with MAFFT (Kalign sometimes seg faults on some data) if kaligner.returncode != 0: with open(output_file, 'w') as out_fh: - command = shlex.split(f'mafft {input_file}') + command = ['mafft', input_file] mafft = subprocess.Popen(command, stdout=out_fh) mafft.wait() _, stderr = mafft.communicate() if mafft.returncode != 0: - sys.stderr.write('Error running mafft') - sys.stderr.write(stderr.decode('utf-8')) + print('Error running mafft', file=sys.stderr) + print(stderr.decode('utf-8'), file=sys.stderr) return 1 else: return 0 @@ -516,11 +513,7 @@ def compressor(seqs, names, ids, og_seqs): else: new_ids.append('MultiSeq'+str(count)+'_'+str(len(same))) count += 1 - name = '' - for s in same: - name += s - name += ',' - new_names.append(name[:-1]) + new_names.append(','.join(s for s in same)) new_seqs.append(seqs[ids.index(same[0])]) new_og_seqs[new_ids[-1]] = og_seqs[same[0]] return new_seqs, new_names, new_ids, new_og_seqs @@ -529,27 +522,36 @@ def compressor(seqs, names, ids, og_seqs): def check_input(reference, reads): """Check input FASTA is occupied and doesn't use invalid characters""" records = [] - invalids = ['E', 'F', 'I', 'J', 'L', 'O', 'P', 'Q', 'X', 'Z'] + invalids = {'E', 'F', 'I', 'J', 'L', 'O', 'P', 'Q', 'X', 'Z'} try: for record in SeqIO.parse(reference, 'fasta'): if any(x in invalids for x in record.seq): - print(f'Input Error: Invalid characters detected in reference sequence') + print( + 'Input Error: Invalid characters detected in reference sequence', + file=sys.stderr + ) return 1 records.append(records) if len(records) != 1: - print('User Error: reference input should contain 
exactly one sequence') + print( + 'User Error: reference input should contain exactly one sequence', + file=sys.stderr + ) return 1 records = [] for record in SeqIO.parse(reads, 'fasta'): if any(x in invalids for x in record.seq): - print(f'Input Error: Invalid characters detected in sequence ID: {record.id}') + print( + f'Input Error: Invalid characters detected in sequence ID: {record.id}', + file=sys.stderr + ) return 1 records.append(record) if len(records) == 0: - print('User Error: sequence FASTA file is empty') + print('User Error: sequence FASTA file is empty', file=sys.stderr) return 1 except Exception: - print('User Error: improperly formatted FASTA') + print('User Error: improperly formatted FASTA', file=sys.stderr) return 1 @@ -561,7 +563,10 @@ def check_tab(tab): if int(tab) in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33]: tab = int(tab) else: - print('User Error: Chosen translation table number is invalid. See: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi for valid options') + print( + 'User Error: Chosen translation table number is invalid. 
See: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi for valid options', + file=sys.stderr + ) return 1 return tab @@ -575,23 +580,35 @@ def genome_mode(reference, reads, start, end, compress, para, tab): try: start = int(start)-1 end = int(end) - except Exception: - print('User Error: invalid start/end coordinate(s)') + except ValueError: + print( + 'User Error: invalid start/end coordinate(s)', + file=sys.stderr + ) return 1 join = 0 if start >= end: - print('User Error: start coordinate(s) must be less than the end coordinate(s)') + print( + 'User Error: start coordinate(s) must be less than the end coordinate(s)', + file=sys.stderr + ) return 1 else: try: start = (int(start.split(',')[0])-1, int(start.split(',')[1])-1) end = (int(end.split(',')[0]), int(end.split(',')[1])) - except Exception: - print('User Error: invalid start/end coordinate(s)') + except ValueError: + print( + 'User Error: invalid start/end coordinate(s)', + file=sys.stderr + ) return 1 join = 1 if start[0] >= end[0] or start[1] >= end[1]: - print('User Error: start coordinate(s) must be less than the end coordinate(s)') + print( + 'User Error: start coordinate(s) must be less than the end coordinate(s)', + file=sys.stderr + ) return 1 # Find protein sequence of gene of interest and extract the original DNA @@ -618,7 +635,7 @@ def genome_mode(reference, reads, start, end, compress, para, tab): og_seqs[idd] = og_seqs[idd][:-3] records = [SeqRecord(seq, id=idd, description=name)] if len(seqs) == 0: - print('No homologous sequences were found') + print('No homologous sequences were found', file=sys.stderr) return 1 combine_align(records, ids, names, seqs) names = dict(zip(ids, names)) @@ -695,7 +712,7 @@ def mixed_mode(reference, reads, compress, para, tab): og_seqs[idd] = og_seqs[idd][:-3] records = [SeqRecord(seq, id=idd, description=name)] if len(seqs) == 0: - print('No homologous sequences were found') + print('No homologous sequences were found', file=sys.stderr) return 1 
combine_align(records, ids, names, seqs) names = dict(zip(ids, names))