Skip to content

Commit

Permalink
Update S01_script_to_choose.py to launch CAP3 first and then merge si…
Browse files Browse the repository at this point in the history
…nglets and contig and reformat headers to allow any assembler input file
  • Loading branch information
Charlotte Berthelier committed Sep 26, 2024
1 parent 143e113 commit f28e656
Showing 1 changed file with 74 additions and 64 deletions.
138 changes: 74 additions & 64 deletions scripts/01_Filter_Assemblies/S01_script_to_choose.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,49 +51,44 @@ def reformat_headers(input_file, output_file, prefix):
outfile.write(sequence + '\n')


def name_formatting(name, prefix):
if not os.path.isfile(name):
print(f"Error: File {name} does not exist.")
return ""

with open(name, "r") as f:
f1 = f.readline() # Only need to check first line to know the assembler which has been used

name_find_orf_input = ""

if f1.startswith(">Locus"):
name_remove_redondancy = os.path.join("outputs", f"02_{os.path.basename(name)}")
os.makedirs(os.path.dirname(name_remove_redondancy), exist_ok=True)
subprocess.run(["python", "S02a_remove_redondancy_from_velvet_oases.py", name, name_remove_redondancy], check=True)
name_find_orf_input = os.path.join("outputs", f"{prefix}_{os.path.basename(name)}")
sed_cmd = f"sed -e 's/Locus_/{prefix}/g' -e 's/_Confidence_/_/g' -e 's/_Transcript_/_/g' -e 's/_Length_/_/g' {name_remove_redondancy}"
sed_output = subprocess.check_output(sed_cmd, shell=True, text=True)
with open(name_find_orf_input, 'w') as file:
file.write(sed_output)
elif f1.startswith(">c"):
# Format the name of the sequences with good name
name_format_fasta = os.path.join("outputs", f"03_{os.path.basename(name)}")
os.makedirs(os.path.dirname(name_format_fasta), exist_ok=True)
subprocess.run(["python", "S02b_format_fasta_name_trinity.py", name, name_format_fasta, prefix], check=True)
# Apply first script to avoid redundant sequences
name_find_orf_input = os.path.join("outputs", f"04_{os.path.basename(name)}")
os.makedirs(os.path.dirname(name_find_orf_input), exist_ok=True)
subprocess.run(["python", "S03_choose_one_variants_per_locus_trinity.py", name_format_fasta, name_find_orf_input], check=True)
elif f1.startswith(">NODE"):
# Handle SPAdes output
name_find_orf_input = os.path.join("outputs", f"{prefix}_{os.path.basename(name)}")
os.makedirs(os.path.dirname(name_find_orf_input), exist_ok=True)
# No additional formatting is done, assuming SPAdes output doesn't need it
subprocess.run(["cp", name, name_find_orf_input], check=True)

return name_find_orf_input
def rename_fasta_headers(input_fasta, output_fasta):
# Extraire le nom de base du fichier (sans l'extension .fasta)
base_name_dir = input_fasta.split('.')[0]
base_name = base_name_dir.split('/')[1]
# Les deux premières lettres du nom de fichier
prefix = base_name[3:5]
# Liste pour stocker les nouvelles séquences
modified_sequences = []

# Lire le fichier fasta et modifier les en-têtes
for index, record in enumerate(SeqIO.parse(input_fasta, "fasta"), start=1):
# Longueur de la séquence
seq_length = len(record.seq)

# Créer le nouvel en-tête selon le format demandé
new_header = f">{prefix}{index}_1/1_1.000_{seq_length}"

# Modifier l'en-tête du record
record.id = new_header[1:] # [1:] pour enlever le ">"
record.description = ""

# Ajouter la séquence modifiée à la liste
modified_sequences.append(record)

# Nom du fichier de sortie
#output_fasta = f"{base_name_dir}_renamed.fasta"

# Écrire le fichier de sortie avec les nouvelles en-têtes
SeqIO.write(modified_sequences, output_fasta, "fasta")


def main():
if len(sys.argv) < 5:
print("Usage: script.py <input_files> <length_seq_min> <percent_identity> <overlap_length>")
sys.exit(1)

os.makedirs("outputs", exist_ok=True)
output_dir = "outputs_new" # Define the output directory
os.makedirs(output_dir, exist_ok=True)
length_seq_min = sys.argv[2]
percent_identity = sys.argv[3]
overlap_length = sys.argv[4]
Expand All @@ -103,36 +98,51 @@ def main():
print(f"Error: Input file {name} does not exist.")
continue

prefix = name[0:2]
name_fasta_formatter = os.path.join("outputs", f"01_{os.path.basename(name)}")
fasta_formatter(name, name_fasta_formatter)
name_find_orf_input = name_formatting(name_fasta_formatter, prefix)

if name_find_orf_input == "":
continue

# Apply the ORF finding script to keep the longest ORF
name_find_orf = os.path.join("outputs", f"05_{os.path.basename(name)}")
os.makedirs(os.path.dirname(name_find_orf), exist_ok=True)
subprocess.run(["python", "S04_find_orf.py", name_find_orf_input, name_find_orf], check=True)

# Apply CAP3
print(f"cap3{name_find_orf} -p {percent_identity} -o {overlap_length}")
subprocess.run(["cap3", name_find_orf, "-p", percent_identity, "-o", overlap_length], check=True)
# Get the base file name
file_name = os.path.basename(name)
# Define the output file path in the output directory
output_file_path = os.path.join(output_dir, file_name)
# Create a symbolic link for the input file in the output directory
symlink_path = os.path.join(output_dir, file_name)
if not os.path.exists(symlink_path):
os.symlink(os.path.abspath(name), symlink_path)

# Print and run the CAP3 command
print(f"cap3 {output_file_path} -p {percent_identity} -o {overlap_length}")
subprocess.run(["cap3", output_file_path, "-p", percent_identity, "-o", overlap_length], check=True)

# Format file to have sequence in one line
name_fasta_formatter = os.path.join(output_dir, f"02_{os.path.basename(name)}")
fasta_formatter(f"{output_file_path}.cap.singlets", name_fasta_formatter)

# Merge singlets and contigs
singlets_output = subprocess.check_output(["zcat", "-f", f"{name_find_orf}.cap.singlets"], text=True)
singlets_output_file = os.path.join("outputs", f"{prefix}_singlets.fasta")
with open(singlets_output_file, 'w') as file:
file.write(singlets_output)

# Apply filter script
name_filter = os.path.join("outputs", f"05_{os.path.basename(name)}")
subprocess.run(["python", "S05_filter.py", singlets_output_file, length_seq_min], check=True)

# Reformat headers in the final output file
final_output_file = os.path.join("outputs", f"{prefix}{name}")
reformat_headers(name_filter, final_output_file, prefix)
merged_file = os.path.join(output_dir, f"03_{file_name}_merged.fasta")
# Define paths for CAP3 output files
cap_singlets_file = os.path.join(output_dir, f"{file_name}.cap.singlets")
cap_contigs_file = os.path.join(output_dir, f"{file_name}.cap.contigs")
print(f"{cap_singlets_file} and {cap_contigs_file}")

with open(merged_file, 'w') as outfile:
# Write the contents of the contigs file first
if os.path.exists(cap_contigs_file):
with open(cap_contigs_file, 'r') as contigs:
outfile.write(contigs.read())
# Append the contents of the singlets file
if os.path.exists(cap_singlets_file):
with open(cap_singlets_file, 'r') as singlets:
outfile.write(singlets.read())

# Reformat headers
name_fasta_final = os.path.join(output_dir, f"04_{os.path.basename(name)}")
rename_fasta_headers(merged_file, name_fasta_final)

# Format final file to have sequence in one line
prefix = file_name[:2]
tmp = prefix + os.path.basename(name)
name_final_file = os.path.join(output_dir, tmp)
fasta_formatter(name_fasta_final, name_final_file)


if __name__ == "__main__":
main()

0 comments on commit f28e656

Please sign in to comment.