Skip to content

Commit

Permalink
New phylophlan integration, no faking anymore
Browse files Browse the repository at this point in the history
  • Loading branch information
PuncocharM committed Sep 27, 2023
1 parent f7810d3 commit 8d58413
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 69 deletions.
24 changes: 6 additions & 18 deletions metaphlan/strainphlan.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,8 @@ def copy_filtered_references(self, markers_tmp_dir):
'.bz2') else os.path.splitext(os.path.basename(reference))[0]

if sample_name in [os.path.splitext(os.path.basename(sample))[0] for sample in self.cleaned_markers_matrix.index]:
reference_marker = os.path.join(self.tmp_dir, "reference_markers", f'{sample_name}.fna')
copyfile(reference_marker, os.path.join(markers_tmp_dir, f"{sample_name}.fna"))
reference_marker = os.path.join(self.tmp_dir, "reference_markers", f'{sample_name}.fna.bz2')
copyfile(reference_marker, os.path.join(markers_tmp_dir, f"{sample_name}.fna.bz2"))

def matrix_markers_to_fasta(self):
"""For each sample, writes the FASTA files with the sequences of the filtered markers
Expand Down Expand Up @@ -212,21 +212,12 @@ def sample_markers_to_fasta(sample_path, filtered_samples, filtered_markers, tri
"""
if sample_path in filtered_samples:
sample_name = os.path.splitext(os.path.basename(sample_path))[0].replace(".pkl", "")
marker_output_file = os.path.join(markers_tmp_dir, '{}.fna'.format(sample_name))
marker_output_file = os.path.join(markers_tmp_dir, '{}.fna.bz2'.format(sample_name))
sample = ConsensusMarkers.from_file(sample_path)
sample.consensus_markers = [m for m in sample.consensus_markers if m.name in filtered_markers]
sample.to_fasta(marker_output_file, trim_ends=trim_sequences)


def cleaned_clade_markers_to_fasta(self):
"""Writes a FASTA file with the sequences of the filtered clade markers"""
clade_tmp_dir = os.path.join(self.tmp_dir, self.clade[:30])
create_folder(clade_tmp_dir)
for m in self.cleaned_markers_matrix.columns.tolist():
marker = ConsensusMarker(m, self.db_clade_markers[m])
markers = ConsensusMarkers([marker])
markers.to_fasta(os.path.join(clade_tmp_dir, '{}.fna'.format(marker.parse_marker_name())), trim_ends=self.trim_sequences)

def get_markers_from_references(self):
"""Gets markers from reference files and returns the marker matrix with the reference markers
Expand Down Expand Up @@ -264,7 +255,7 @@ def process_reference(cls, reference_file, tmp_dir, clade_markers_file, clade_ma
os.makedirs(reference_markers_dir, exist_ok=True)

consensus_markers = ConsensusMarkers([ConsensusMarker(m, s) for m, s in ext_markers.items()])
consensus_markers.to_fasta(os.path.join(reference_markers_dir, f'{name}.fna'), trim_ends=trim_sequences)
consensus_markers.to_fasta(os.path.join(reference_markers_dir, f'{name}.fna.bz2'), trim_ends=trim_sequences)

markers_matrix = {'sample': reference_file}
markers_matrix.update({m: int(m in ext_markers) for m in clade_markers})
Expand Down Expand Up @@ -530,15 +521,12 @@ def run_strainphlan(self):
info("Writing samples as markers' FASTA files...")
samples_as_markers_dir = self.matrix_markers_to_fasta()
info("Done.")
info("Writing filtered clade markers as FASTA file...")
self.cleaned_clade_markers_to_fasta()
info("Done.")
info("Calculating polymorphic rates...")
self.calculate_polymorphic_rates()
info("Done.")
info("Executing PhyloPhlAn...")
self.phylophlan_controller.compute_phylogeny(samples_as_markers_dir, len(
self.cleaned_markers_matrix), self.min_markers, self.tmp_dir)
self.phylophlan_controller.compute_phylogeny(samples_as_markers_dir, len(self.cleaned_markers_matrix),
self.tmp_dir)
info("Done.")
info("Writing information file...")
self.write_info()
Expand Down
2 changes: 1 addition & 1 deletion metaphlan/utils/consensus_markers.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def to_json(self, output_path):
def to_fasta(self, output_file, trim_ends=0):
"""Writes the consensus markers to FASTA"""
seq_records = [m.to_seq_record(trim_ends) for m in self.consensus_markers]
with open(output_file, 'w') as f:
with bz2.open(output_file, 'wt') as f:
SeqIO.write(seq_records, f, 'fasta')

def __str__(self):
Expand Down
41 changes: 25 additions & 16 deletions metaphlan/utils/external_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,17 @@ def execute(cmd):
cmd (dict): the dict with the command line information
"""
inp_f = None
out_f = sb.DEVNULL
out_f = sb.PIPE
if cmd['stdin']:
inp_f = open(cmd['stdin'], 'r')
if cmd['stdout']:
out_f = open(cmd['stdout'], 'w')
exec_res = sb.run(cmd['command_line'], stdin=inp_f, stdout=out_f)
if exec_res.returncode == 1:
error("An error was ocurred executing a external tool, exiting...", exit=True)
print(exec_res.stdout)
print('===')
print(exec_res.stderr)
if cmd['stdin']:
inp_f.close()
if cmd['stdout']:
Expand Down Expand Up @@ -105,24 +108,30 @@ def generate_phylophlan_config_file(output_dir, configuration):
return conf_file


def execute_phylophlan(samples_markers_dir, conf_file, min_entries, min_markers, tmp_dir, output_dir, clade, phylogeny_conf, additional_params, mutation_rates, nprocs):
def execute_phylophlan(samples_markers_dir, conf_file, tmp_dir, output_dir, phylogeny_conf, additional_params, mutation_rates, nprocs):
"""Executes PhyloPhlAn"""
advanced_params = "--" + phylogeny_conf
cmd = f'phylophlan -i {samples_markers_dir} -o . --output_folder {output_dir} --nproc {nprocs} --strainphlan' \
f' --{phylogeny_conf} --data_folder {tmp_dir} -t n -f {conf_file} --diversity low' \
f' --genome_extension fna --min_num_entries 1 --min_num_markers 1'

if additional_params is not None:
advanced_params += " " + additional_params
cmd += " " + additional_params
if mutation_rates:
advanced_params += " --mutation_rates"
params = {
"program_name": "phylophlan",
"params": "-d {} --data_folder {} --databases_folder {} -t n -f {} --diversity low {} --genome_extension fna --force_nucleotides --min_num_entries {} --convert_N2gap --min_num_markers {}".format(clade[:30], tmp_dir, tmp_dir, conf_file, advanced_params, min_entries, min_markers),
"input": "-i",
"output_path": "--output_folder",
"output": "-o",
"threads": "--nproc",
"command_line": "#program_name# #input# #output# #output_path# #params# #threads#"
}
execute(compose_command(params=params, input_file=samples_markers_dir, output_path=output_dir,
output_file=".", nproc=nprocs))
cmd += " --mutation_rates"

r = sb.run(cmd, capture_output=True, shell=True)
with open(os.path.join(tmp_dir, "phylophlan_log.stdout"), 'wb') as f:
f.write(r.stdout)
with open(os.path.join(tmp_dir, "phylophlan_log.stderr"), 'wb') as f:
f.write(r.stderr)
if r.returncode != 0:
error("Error executing PhyloPhlAn")
info('== stdout ==')
print(r.stdout.decode())
info('== stderr ==')
print(r.stderr.decode())
info('exiting')
exit(1)


def execute_treeshrink(input_tree, output_dir, tmp=None, centroid=False, q_value=0.05, m_value='all-genes'):
Expand Down
41 changes: 7 additions & 34 deletions metaphlan/utils/phylophlan_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,50 +40,23 @@ def get_phylophlan_configuration(self):
class Phylophlan3Controller(PhylophlanController):
"""PhyloPhlAn3Controller class"""

def fake_phylophlan_inputs(self, samples_markers_dir):
"""Fakes the PhyloPhlAn inputs for the reconstructed markers

Args:
samples_markers_dir (str): Directory containing the samples as FASTA
"""
os.mkdir(os.path.join(self.tmp_dir, 'markers_dna'))
os.mkdir(os.path.join(self.tmp_dir, 'map_dna'))
os.mkdir(os.path.join(self.tmp_dir, 'clean_dna'))
for sample in os.listdir(samples_markers_dir):
open(os.path.join(self.tmp_dir, 'map_dna',
sample.replace('.fna', '.b6o.bkp')), 'a').close()
open(os.path.join(self.tmp_dir, 'map_dna',
sample.replace('.fna', '.b6o.bz2')), 'a').close()
copy(os.path.join(samples_markers_dir, sample),
os.path.join(self.tmp_dir, 'clean_dna', sample))
with bz2.open(os.path.join(self.tmp_dir, 'markers_dna', '{}.bz2'.format(sample)), 'wt') as write_file:
with open(os.path.join(samples_markers_dir, sample), 'r') as read_file:
for line in read_file:
write_file.write(line)

def compute_phylogeny(self, samples_markers_dir, num_samples, min_markers, temporal_dir):
def compute_phylogeny(self, samples_markers_dir, num_samples, tmp_dir):
"""Executes PhyloPhlAn to compute phylogeny
Args:
samples_markers_dir (str): Directory containing the samples as FASTA
num_samples (int): Minimum number or samples containing a marker
min_markers (int): Mininum number of markers per sample
temporal_dir (str): Temporal directory
num_samples (int): Minimum number of samples containing a marker
tmp_dir (str): Temporal directory
"""
info("\tCreating PhyloPhlAn database...")
self.tmp_dir = temporal_dir
create_phylophlan_db(self.tmp_dir, self.clade[:30])
info("\tDone.")
if not self.phylophlan_configuration:
info("\tGenerating PhyloPhlAn configuration file...")
self.phylophlan_configuration = generate_phylophlan_config_file(
self.tmp_dir, self.get_phylophlan_configuration())
tmp_dir, self.get_phylophlan_configuration())
info("\tDone.")
self.fake_phylophlan_inputs(samples_markers_dir)
info("\tProcessing samples...")
min_entries = self.marker_in_n_samples if self.abs_n_samples_thres else self.marker_in_n_samples * num_samples // 100
execute_phylophlan(samples_markers_dir, self.phylophlan_configuration, min_entries, int(round(min_markers, 0)),
self.tmp_dir, self.output_dir, self.clade, self.phylophlan_mode, self.phylophlan_params, self.mutation_rates, self.nprocs)
execute_phylophlan(samples_markers_dir, self.phylophlan_configuration, tmp_dir, self.output_dir,
self.phylophlan_mode, self.phylophlan_params, self.mutation_rates, self.nprocs)
if self.mutation_rates:
move(os.path.join(self.output_dir, "mutation_rates.tsv"), os.path.join(
self.output_dir, "{}.mutation".format(self.clade)))
Expand All @@ -93,7 +66,7 @@ def compute_phylogeny(self, samples_markers_dir, num_samples, min_markers, tempo
if self.treeshrink:
info("Executing TreeShrink...")
execute_treeshrink(os.path.join(self.output_dir, 'RAxML_bestTree.{}.StrainPhlAn4.tre'.format(
self.clade)), self.output_dir, tmp=self.tmp_dir, centroid=True if num_samples >= 100 else False)
self.clade)), self.output_dir, tmp=tmp_dir, centroid=True if num_samples >= 100 else False)
info("Done.")
info('This StrainPhlAn analysis ran TreeShrink, do not forget to cite:')
info('Mai, Uyen, and Siavash Mirarab. 2018. “TreeShrink: Fast and Accurate Detection of Outlier Long Branches in Collections of Phylogenetic Trees.” BMC Genomics 19 (S5): 272. https://doi.org/10.1186/s12864-018-4620-2.')
Expand Down

0 comments on commit 8d58413

Please sign in to comment.