diff --git a/metaphlan/strainphlan.py b/metaphlan/strainphlan.py index ecc252d..28a4c7a 100755 --- a/metaphlan/strainphlan.py +++ b/metaphlan/strainphlan.py @@ -180,8 +180,8 @@ def copy_filtered_references(self, markers_tmp_dir): '.bz2') else os.path.splitext(os.path.basename(reference))[0] if sample_name in [os.path.splitext(os.path.basename(sample))[0] for sample in self.cleaned_markers_matrix.index]: - reference_marker = os.path.join(self.tmp_dir, "reference_markers", f'{sample_name}.fna') - copyfile(reference_marker, os.path.join(markers_tmp_dir, f"{sample_name}.fna")) + reference_marker = os.path.join(self.tmp_dir, "reference_markers", f'{sample_name}.fna.bz2') + copyfile(reference_marker, os.path.join(markers_tmp_dir, f"{sample_name}.fna.bz2")) def matrix_markers_to_fasta(self): """For each sample, writes the FASTA files with the sequences of the filtered markers @@ -212,21 +212,12 @@ def sample_markers_to_fasta(sample_path, filtered_samples, filtered_markers, tri """ if sample_path in filtered_samples: sample_name = os.path.splitext(os.path.basename(sample_path))[0].replace(".pkl", "") - marker_output_file = os.path.join(markers_tmp_dir, '{}.fna'.format(sample_name)) + marker_output_file = os.path.join(markers_tmp_dir, '{}.fna.bz2'.format(sample_name)) sample = ConsensusMarkers.from_file(sample_path) sample.consensus_markers = [m for m in sample.consensus_markers if m.name in filtered_markers] sample.to_fasta(marker_output_file, trim_ends=trim_sequences) - def cleaned_clade_markers_to_fasta(self): - """Writes a FASTA file with the sequences of the filtered clade markers""" - clade_tmp_dir = os.path.join(self.tmp_dir, self.clade[:30]) - create_folder(clade_tmp_dir) - for m in self.cleaned_markers_matrix.columns.tolist(): - marker = ConsensusMarker(m, self.db_clade_markers[m]) - markers = ConsensusMarkers([marker]) - markers.to_fasta(os.path.join(clade_tmp_dir, '{}.fna'.format(marker.parse_marker_name())), trim_ends=self.trim_sequences) - def get_markers_from_references(self): """Gets markers from reference files and returns the marker matrix with the reference markers @@ -264,7 +255,7 @@ def process_reference(cls, reference_file, tmp_dir, clade_markers_file, clade_ma os.makedirs(reference_markers_dir, exist_ok=True) consensus_markers = ConsensusMarkers([ConsensusMarker(m, s) for m, s in ext_markers.items()]) - consensus_markers.to_fasta(os.path.join(reference_markers_dir, f'{name}.fna'), trim_ends=trim_sequences) + consensus_markers.to_fasta(os.path.join(reference_markers_dir, f'{name}.fna.bz2'), trim_ends=trim_sequences) markers_matrix = {'sample': reference_file} markers_matrix.update({m: int(m in ext_markers) for m in clade_markers}) @@ -530,15 +521,12 @@ def run_strainphlan(self): info("Writing samples as markers' FASTA files...") samples_as_markers_dir = self.matrix_markers_to_fasta() info("Done.") - info("Writing filtered clade markers as FASTA file...") - self.cleaned_clade_markers_to_fasta() - info("Done.") info("Calculating polymorphic rates...") self.calculate_polymorphic_rates() info("Done.") info("Executing PhyloPhlAn...") - self.phylophlan_controller.compute_phylogeny(samples_as_markers_dir, len( - self.cleaned_markers_matrix), self.min_markers, self.tmp_dir) + self.phylophlan_controller.compute_phylogeny(samples_as_markers_dir, len(self.cleaned_markers_matrix), + self.tmp_dir) info("Done.") info("Writing information file...") self.write_info() diff --git a/metaphlan/utils/consensus_markers.py b/metaphlan/utils/consensus_markers.py index 5f6f14f..0208da6 100644 --- a/metaphlan/utils/consensus_markers.py +++ b/metaphlan/utils/consensus_markers.py @@ -160,7 +160,7 @@ def to_json(self, output_path): def to_fasta(self, output_file, trim_ends=0): """Writes the consensus markers to FASTA""" seq_records = [m.to_seq_record(trim_ends) for m in self.consensus_markers] - with open(output_file, 'w') as f: + with bz2.open(output_file, 'wt') as f: SeqIO.write(seq_records, f, 'fasta') def __str__(self): diff --git a/metaphlan/utils/external_exec.py b/metaphlan/utils/external_exec.py index 5c32ac4..fe873c5 100755 --- a/metaphlan/utils/external_exec.py +++ b/metaphlan/utils/external_exec.py @@ -25,7 +25,7 @@ def execute(cmd): cmd (dict): the dict with the command line information """ inp_f = None - out_f = sb.DEVNULL + out_f = sb.PIPE if cmd['stdin']: inp_f = open(cmd['stdin'], 'r') if cmd['stdout']: @@ -33,6 +33,9 @@ def execute(cmd): exec_res = sb.run(cmd['command_line'], stdin=inp_f, stdout=out_f) if exec_res.returncode == 1: error("An error was ocurred executing a external tool, exiting...", exit=True) + print(exec_res.stdout) + print('===') + print(exec_res.stderr) if cmd['stdin']: inp_f.close() if cmd['stdout']: @@ -105,24 +108,30 @@ def generate_phylophlan_config_file(output_dir, configuration): return conf_file -def execute_phylophlan(samples_markers_dir, conf_file, min_entries, min_markers, tmp_dir, output_dir, clade, phylogeny_conf, additional_params, mutation_rates, nprocs): +def execute_phylophlan(samples_markers_dir, conf_file, tmp_dir, output_dir, phylogeny_conf, additional_params, mutation_rates, nprocs): """Executes PhyloPhlAn""" - advanced_params = "--" + phylogeny_conf + cmd = f'phylophlan -i {samples_markers_dir} -o . --output_folder {output_dir} --nproc {nprocs} --strainphlan' \ + f' --{phylogeny_conf} --data_folder {tmp_dir} -t n -f {conf_file} --diversity low' \ + f' --genome_extension fna --min_num_entries 1 --min_num_markers 1' + if additional_params is not None: - advanced_params += " " + additional_params + cmd += " " + additional_params if mutation_rates: - advanced_params += " --mutation_rates" - params = { - "program_name": "phylophlan", - "params": "-d {} --data_folder {} --databases_folder {} -t n -f {} --diversity low {} --genome_extension fna --force_nucleotides --min_num_entries {} --convert_N2gap --min_num_markers {}".format(clade[:30], tmp_dir, tmp_dir, conf_file, advanced_params, min_entries, min_markers), - "input": "-i", - "output_path": "--output_folder", - "output": "-o", - "threads": "--nproc", - "command_line": "#program_name# #input# #output# #output_path# #params# #threads#" - } - execute(compose_command(params=params, input_file=samples_markers_dir, output_path=output_dir, - output_file=".", nproc=nprocs)) + cmd += " --mutation_rates" + + r = sb.run(cmd, capture_output=True, shell=True) + with open(os.path.join(tmp_dir, "phylophlan_log.stdout"), 'wb') as f: + f.write(r.stdout) + with open(os.path.join(tmp_dir, "phylophlan_log.stderr"), 'wb') as f: + f.write(r.stderr) + if r.returncode != 0: + error("Error executing PhyloPhlAn") + info('== stdout ==') + print(r.stdout.decode()) + info('== stderr ==') + print(r.stderr.decode()) + info('exiting') + exit(1) def execute_treeshrink(input_tree, output_dir, tmp=None, centroid=False, q_value=0.05, m_value='all-genes'): diff --git a/metaphlan/utils/phylophlan_controller.py b/metaphlan/utils/phylophlan_controller.py index 97ce7f9..0b7689f 100644 --- a/metaphlan/utils/phylophlan_controller.py +++ b/metaphlan/utils/phylophlan_controller.py @@ -40,50 +40,23 @@ def get_phylophlan_configuration(self): class Phylophlan3Controller(PhylophlanController): """PhyloPhlAn3Controller class""" - def fake_phylophlan_inputs(self, samples_markers_dir): - """Fakes the PhyloPhlAn inputs for the reconstructed markers - Args: - samples_markers_dir (str): Directory containing the samples as FASTA - """ - os.mkdir(os.path.join(self.tmp_dir, 'markers_dna')) - os.mkdir(os.path.join(self.tmp_dir, 'map_dna')) - os.mkdir(os.path.join(self.tmp_dir, 'clean_dna')) - for sample in os.listdir(samples_markers_dir): - open(os.path.join(self.tmp_dir, 'map_dna', - sample.replace('.fna', '.b6o.bkp')), 'a').close() - open(os.path.join(self.tmp_dir, 'map_dna', - sample.replace('.fna', '.b6o.bz2')), 'a').close() - copy(os.path.join(samples_markers_dir, sample), - os.path.join(self.tmp_dir, 'clean_dna', sample)) - with bz2.open(os.path.join(self.tmp_dir, 'markers_dna', '{}.bz2'.format(sample)), 'wt') as write_file: - with open(os.path.join(samples_markers_dir, sample), 'r') as read_file: - for line in read_file: - write_file.write(line) - - def compute_phylogeny(self, samples_markers_dir, num_samples, min_markers, temporal_dir): + def compute_phylogeny(self, samples_markers_dir, num_samples, tmp_dir): """Executes PhyloPhlAn to compute phylogeny Args: samples_markers_dir (str): Directory containing the samples as FASTA - num_samples (int): Minimum number or samples containing a marker - min_markers (int): Mininum number of markers per sample - temporal_dir (str): Temporal directory + num_samples (int): Minimum number of samples containing a marker + tmp_dir (str): Temporal directory """ - info("\tCreating PhyloPhlAn database...") - self.tmp_dir = temporal_dir - create_phylophlan_db(self.tmp_dir, self.clade[:30]) - info("\tDone.") if not self.phylophlan_configuration: info("\tGenerating PhyloPhlAn configuration file...") self.phylophlan_configuration = generate_phylophlan_config_file( - self.tmp_dir, self.get_phylophlan_configuration()) + tmp_dir, self.get_phylophlan_configuration()) info("\tDone.") - self.fake_phylophlan_inputs(samples_markers_dir) info("\tProcessing samples...") - min_entries = self.marker_in_n_samples if self.abs_n_samples_thres else self.marker_in_n_samples * num_samples // 100 - execute_phylophlan(samples_markers_dir, self.phylophlan_configuration, min_entries, int(round(min_markers, 0)), - self.tmp_dir, self.output_dir, self.clade, self.phylophlan_mode, self.phylophlan_params, self.mutation_rates, self.nprocs) + execute_phylophlan(samples_markers_dir, self.phylophlan_configuration, tmp_dir, self.output_dir, + self.phylophlan_mode, self.phylophlan_params, self.mutation_rates, self.nprocs) if self.mutation_rates: move(os.path.join(self.output_dir, "mutation_rates.tsv"), os.path.join( self.output_dir, "{}.mutation".format(self.clade))) @@ -93,7 +66,7 @@ def compute_phylogeny(self, samples_markers_dir, num_samples, min_markers, tempo if self.treeshrink: info("Executing TreeShrink...") execute_treeshrink(os.path.join(self.output_dir, 'RAxML_bestTree.{}.StrainPhlAn4.tre'.format( - self.clade)), self.output_dir, tmp=self.tmp_dir, centroid=True if num_samples >= 100 else False) + self.clade)), self.output_dir, tmp=tmp_dir, centroid=True if num_samples >= 100 else False) info("Done.") info('This StrainPhlAn analysis ran TreeShrink, do not forget to cite:') info('Mai, Uyen, and Siavash Mirarab. 2018. “TreeShrink: Fast and Accurate Detection of Outlier Long Branches in Collections of Phylogenetic Trees.” BMC Genomics 19 (S5): 272. https://doi.org/10.1186/s12864-018-4620-2.')