pughlab_emseq_pipeline.pl

#!/usr/bin/env perl
### pughlab_emseq_pipeline.pl ######################################################################
use AutoLoader 'AUTOLOAD';
use strict;
use warnings;
use Carp;
use POSIX qw(strftime);
use Getopt::Std;
use Getopt::Long;
use File::Basename;
use File::Path qw(make_path);
use YAML qw(LoadFile);
use List::Util qw(any none);
use Data::Dumper;

my $cwd = dirname(__FILE__);
require "$cwd/scripts/utilities.pl";

####################################################################################################
# version       author	  	comment
# 1.0		sprokopec       script to run PughLab EMSeq pipeline

### USAGE ##########################################################################################
# pughlab_emseq_pipeline.pl -t tool_config.yaml -d data_config.yaml [--fastq_prep] [--alignment] [--qc] [--analysis]
#
# where:
#	- tool_config.yaml contains tool versions and parameters, output directory, reference
#	information, etc.
#	- data_config.yaml contains sample information (YAML file containing paths to FASTQ files,
#	generated by create_fastq_yaml.pl)

### SUBROUTINES ####################################################################################

### MAIN ###########################################################################################
# Launch the requested stages of the EM-Seq pipeline.
#
# Named arguments:
#	tool_config	path to the tool config YAML (loaded with LoadFile)
#	data_config	path to the sample/data config YAML
#	step1		true to run fastq preparation (trim_adapters.pl, collect_fastqc_metrics.pl)
#	step2		true to run alignment (bwa.pl)
#	step3		true to run BAM QC (get_sequencing_metrics.pl)
#	step4		true to run methylation analysis (methyldackel.pl)
#	step5		accepted but not used by this sub
#	cleanup		true to append --remove to the bwa.pl and get_sequencing_metrics.pl commands
#	cluster		scheduler name, forwarded to each stage script (-c) and to
#			write_script/submit_job as hpc_driver
#	dry_run		true to run each stage script with --dry-run instead of submitting it
#
# Within an enabled step, a stage is only launched when the tool config sets
# <tool>->{run} to 'Y'. Every launched stage is written to the run log, turned
# into a job script with write_script() and then either submitted with
# submit_job() or, for a dry run, executed directly with --dry-run appended.
#
# Dies if the tool config has no output_dir or the log file cannot be opened.
sub main {
	my %args = (
		tool_config	=> undef,
		data_config	=> undef,
		step1		=> undef,
		step2		=> undef,
		step3		=> undef,
		step4		=> undef,
		step5		=> undef,
		cleanup		=> undef,
		cluster		=> undef,
		dry_run		=> undef,
		@_
		);

	my $tool_config = $args{tool_config};
	my $data_config = $args{data_config};

	### PREAMBLE ######################################################################################

	# load tool config
	my $tool_data = LoadFile($tool_config);
	my $timestamp = strftime "%F_%H-%M-%S", localtime;

	# check for and/or create output directories
	my $output_directory = $tool_data->{output_dir};
	unless (defined($output_directory)) {
		die "No output_dir defined in tool config: $tool_config";
		}

	$output_directory =~ s/\/$//;
	my $log_directory = join('/', $output_directory, 'logs', 'run_EMSeq_pipeline_' . $timestamp);

	unless(-e $output_directory) { make_path($output_directory); }
	unless(-e $log_directory) { make_path($log_directory); }

	# start logging
	my $log_file = join('/', $log_directory, 'run_EMSeq_pipeline.log');
	open (my $log, '>', $log_file) or die "Could not open $log_file for writing: $!";

	print $log "---\n";
	print $log "Running PughLab EM-Seq pipeline.\n";
	print $log "\n  Tool config used: $tool_config";
	print $log "\n    Output directory: $output_directory";
	print $log "\n  Sample config used: $data_config";
	print $log "\n---\n\n";

	# indicate maximum time limit for parent jobs to wait
	my $max_time = '5-00:00:00';

	my $perl = 'perl/' . $tool_data->{perl_version};

	# get optional HPC group
	my $hpc_group = defined($tool_data->{hpc_group}) ? "-A $tool_data->{hpc_group}" : undef;

	### MAIN ###########################################################################################

	my ($trim_run_id, $fastqc_run_id, $bwa_run_id, $qc_run_id, $md_run_id);

	# prepare directory structure
	my $trim_directory = join('/', $output_directory, 'fastq_trimmed');
	my $fastqc_directory = join('/', $output_directory, 'fastqc');
	my $bwa_directory = join('/', $output_directory, 'BWA');
	my $qc_directory = join('/', $output_directory, 'BAMQC');
	my $methylD_directory = join('/', $output_directory, 'MethylDackel');

	# check which tools have been requested
	my %tool_set = (
		'trim_adapters' => defined($tool_data->{trim_adapters}->{run}) ? $tool_data->{trim_adapters}->{run} : 'N',
		'fastqc' => defined($tool_data->{fastqc}->{run}) ? $tool_data->{fastqc}->{run} : 'N',
		'bwa'	=> defined($tool_data->{bwa}->{run}) ? $tool_data->{bwa}->{run} : 'N',
		'bamqc'	=> defined($tool_data->{bamqc}->{run}) ? $tool_data->{bamqc}->{run} : 'N',
		'methylD' => defined($tool_data->{methyldackel}->{run}) ? $tool_data->{methyldackel}->{run} : 'N'
		);

	print $log Dumper \%tool_set;

	# indicate YAML files for processed files
	my $fastq_trimmed_output_yaml = join('/', $trim_directory, 'fastq_trimmed_config.yaml');
	my $bwa_output_yaml = join('/', $bwa_directory, 'bwa_bam_config_' . $timestamp . '.yaml');

	# are we running step1 (fastq prep)? if trimming will not be launched and no
	# trimmed-fastq YAML already exists, downstream stages read the data config
	my $will_trim = ($args{step1} && ('Y' eq $tool_set{'trim_adapters'}));
	if ( (!$args{step1}) || ( (!$will_trim) && (!-e $fastq_trimmed_output_yaml) ) ) {
		$fastq_trimmed_output_yaml = $data_config;
		}

	# are we running step2 (alignments) or are BAMs provided as input?
	# the BWA YAML name is timestamped, so it can only exist if bwa.pl is launched by this run
	if ( (!$args{step2}) || ('Y' ne $tool_set{'bwa'}) ) {
		$bwa_output_yaml = $data_config;
		}

	# Shared launcher for a single stage; returns the id used by downstream dependencies.
	# Named arguments: script (file name, for the log), name (job name), label (for the
	# job-id log line), command, dependencies (optional), max_time (optional).
	my $run_step = sub {
		my %step = (
			script		=> undef,
			name		=> undef,
			label		=> undef,
			command		=> undef,
			dependencies	=> undef,
			max_time	=> $max_time,
			@_
			);

		# record command (in log directory) and then run job
		print $log "Submitting job for $step{script}\n";
		print $log "  COMMAND: $step{command}\n\n";

		my %script_args = (
			log_dir		=> $log_directory,
			name		=> $step{name},
			cmd		=> $step{command},
			modules		=> [$perl],
			mem		=> '256M',
			max_time	=> $step{max_time},
			hpc_driver	=> $args{cluster},
			extra_args	=> [$hpc_group]
			);

		# only pass a dependency when an upstream job was actually launched
		if (defined($step{dependencies})) {
			$script_args{dependencies} = $step{dependencies};
			}

		my $run_script = write_script(%script_args);

		if ($args{dry_run}) {

			# run the stage script directly with --dry-run (output is discarded);
			# the job name stands in for a job id
			my $dry_command = $step{command} . " --dry-run";
			`$dry_command`;
			return($step{name});
			}

		my $run_id = submit_job(
			jobname		=> $log_directory,
			shell_command	=> $run_script,
			hpc_driver	=> $args{cluster},
			dry_run		=> $args{dry_run},
			log_file	=> $log
			);

		print $log ">>> $step{label} job id: $run_id\n\n";
		return($run_id);
		};

	# Should pre-processing of fastqs (adapter trimming + QC) be performed?
	if ($args{step1}) {

		## run AdapterTrim pipeline
		unless(-e $trim_directory) { make_path($trim_directory); }

		if ('Y' eq $tool_set{'trim_adapters'}) {

			$trim_run_id = $run_step->(
				script		=> 'trim_adapters.pl',
				name		=> 'pughlab_dna_pipeline__run_trim_adapters',
				label		=> 'AdapterTrim',
				max_time	=> '48:00:00',
				command		=> join(' ',
					"perl $cwd/scripts/trim_adapters.pl",
					"-o", $trim_directory,
					"-t", $tool_config,
					"-d", $data_config,
					"-b", $fastq_trimmed_output_yaml,
					"-c", $args{cluster}
					)
				);
			}

		## run FASTQC pipeline
		unless(-e $fastqc_directory) { make_path($fastqc_directory); }

		if ('Y' eq $tool_set{'fastqc'}) {

			$fastqc_run_id = $run_step->(
				script		=> 'collect_fastqc_metrics.pl',
				name		=> 'pughlab_dna_pipeline__run_fastqc',
				label		=> 'FASTQC',
				dependencies	=> $trim_run_id,
				max_time	=> '48:00:00',
				command		=> join(' ',
					"perl $cwd/scripts/collect_fastqc_metrics.pl",
					"-o", $fastqc_directory,
					"-t", $tool_config,
					"-d", $fastq_trimmed_output_yaml,
					"-c", $args{cluster}
					)
				);
			}
		}

	# Should alignment be performed?
	if ($args{step2}) {

		## run BWA-alignment pipeline
		unless(-e $bwa_directory) { make_path($bwa_directory); }

		if ('Y' eq $tool_set{'bwa'}) {

			my $bwa_command = join(' ',
				"perl $cwd/scripts/bwa.pl",
				"-o", $bwa_directory,
				"-t", $tool_config,
				"-d", $fastq_trimmed_output_yaml,
				"-b", $bwa_output_yaml,
				"-c", $args{cluster}
				);

			if ($args{cleanup}) {
				$bwa_command .= " --remove";
				}

			$bwa_run_id = $run_step->(
				script		=> 'bwa.pl',
				name		=> 'pughlab_dna_pipeline__run_bwa',
				label		=> 'BWA',
				dependencies	=> $trim_run_id,
				command		=> $bwa_command
				);
			}
		}

	## Collect various alignment metrics
	if ($args{step3}) {

		unless(-e $qc_directory) { make_path($qc_directory); }

		if ('Y' eq $tool_set{'bamqc'}) {

			# QC (Picard QC functions) pipeline
			my $qc_command = join(' ',
				"perl $cwd/scripts/get_sequencing_metrics.pl",
				"-o", $qc_directory,
				"-t", $tool_config,
				"-d", $bwa_output_yaml,
				"-c", $args{cluster}
				);

			if ($args{cleanup}) {
				$qc_command .= " --remove";
				}

			$qc_run_id = $run_step->(
				script		=> 'get_sequencing_metrics.pl',
				name		=> 'pughlab_dna_pipeline__run_qc',
				label		=> 'QC',
				dependencies	=> $bwa_run_id,
				command		=> $qc_command
				);
			}
		}

	## Should methylation analyses be performed?
	if ($args{step4}) {

		unless(-e $methylD_directory) { make_path($methylD_directory); }

		if ('Y' eq $tool_set{'methylD'}) {

			# methyldackel to extract site-wise methylation
			# (--remove is not forwarded to methyldackel.pl)
			my $md_command = join(' ',
				"perl $cwd/scripts/methyldackel.pl",
				"-o", $methylD_directory,
				"-t", $tool_config,
				"-d", $bwa_output_yaml,
				"-c", $args{cluster}
				);

			# wait for the QC job when one was launched (it already waits for BWA);
			# otherwise wait for the BWA job itself, since this stage reads the BWA output YAML
			my $md_dependency = defined($qc_run_id) ? $qc_run_id : $bwa_run_id;

			$md_run_id = $run_step->(
				script		=> 'methyldackel.pl',
				name		=> 'pughlab_dna_pipeline__run_methyldackel',
				label		=> 'MethylDackel',
				dependencies	=> $md_dependency,
				command		=> $md_command
				);
			}
		}

	# finish up
	print $log "\nProgramming terminated successfully.\n\n";
	close $log;

	}

### GETOPTS AND DEFAULT VALUES #####################################################################
# Parse the command line, validate the required inputs and hand off to main().

# declare variables
my ($tool_config, $data_config);
# $summarize and $create_report are only referenced by the commented-out options below
my ($fastq_prep, $alignment, $qc, $analysis, $summarize, $create_report);
my $hpc_driver = 'slurm';
my ($remove_junk, $dry_run);
my $help;

# read in command line arguments; stop on unknown or malformed options
GetOptions(
	'h|help'		=> \$help,
	't|tool=s'		=> \$tool_config,
	'd|data=s'		=> \$data_config,
	'fastq_prep'		=> \$fastq_prep,
	'alignment'		=> \$alignment,
	'qc'			=> \$qc,
	'analysis'		=> \$analysis,
#	'summarize'		=> \$summarize,
#	'create_report'		=> \$create_report,
	'c|cluster=s'		=> \$hpc_driver,
	'remove'		=> \$remove_junk,
	'dry-run'		=> \$dry_run
	 ) or die("Error in command line arguments; use --help to list the available options");

if ($help) {
	my $help_msg = join("\n",
		"Options:",
		"\t--help|-h\tPrint this help message",
		"\t--data|-d\t<string> data config (yaml format)",
		"\t--tool|-t\t<string> tool config (yaml format)",
		"\t--fastq_prep\t<boolean> should fastqs be trimmed? (default: false)",
		"\t--alignment\t<boolean> should read alignment be performed? (default: false)",
		"\t--qc\t\t<boolean> should QC metrics be generated on the BAMs? (default: false)",
		"\t--analysis\t<boolean> should methylation analyses be performed? (default: false)",
#		"\t--summarize\t<boolean> should output be summarized? (default: false)",
#		"\t--create_report\t<boolean> should a report be generated? (default: false)",
		"\t--cluster|-c\t<string> cluster scheduler (default: slurm)",
		"\t--remove\t<boolean> should intermediates be removed? (default: false)",
		"\t--dry-run\t<boolean> write commands to file without submitting jobs? (default: false)"
		);

	print "$help_msg\n";
	exit;
	}

# at least one of the steps that main() implements must be requested
if ( (!$fastq_prep) && (!$alignment) && (!$qc) && (!$analysis) ) {
	die("Please choose a step to run (at least one of --fastq_prep, --alignment, --qc, --analysis )");
	}
if (!defined($tool_config)) {
	die("No tool config file defined; please provide -t | --tool (ie, tool_config.yaml)");
	}
if (!defined($data_config)) {
	die("No data config file defined; please provide -d | --data (ie, sample_config.yaml)");
	}

# check for compatible HPC driver; if not found, change dry_run to Y
# (exact string comparison, so partial names or regex metacharacters cannot pass as a known driver)
my @compatible_drivers = qw(slurm);
if ( (none { $_ eq $hpc_driver } @compatible_drivers ) && (!$dry_run) ) {
	print "Unrecognized HPC driver requested: setting dry_run to true -- jobs will not be submitted but commands will be written to file.\n";
	$dry_run = 1;
	}

main(
	tool_config	=> $tool_config,
	data_config	=> $data_config,
	step1		=> $fastq_prep,
	step2		=> $alignment,
	step3		=> $qc,
	step4		=> $analysis,
#	step4		=> $summarize,
#	step5		=> $create_report,
	cluster		=> $hpc_driver,
	cleanup		=> $remove_junk,
	dry_run		=> $dry_run
	);