From a754f8b6e741c011584c11f6963b78e6872400fa Mon Sep 17 00:00:00 2001
From: "Jose F. Sanchez" <jfsanchezherrero@ub.edu>
Date: Wed, 7 Dec 2016 11:40:56 +0100
Subject: [PATCH] Overlapping and sliding changed

---
 src/perl/DM_MarkerScan_v1.0.2.pl | 321 +++++++++++++++++--------------
 1 file changed, 173 insertions(+), 148 deletions(-)

diff --git a/src/perl/DM_MarkerScan_v1.0.2.pl b/src/perl/DM_MarkerScan_v1.0.2.pl
index 8326475..084f0f8 100644
--- a/src/perl/DM_MarkerScan_v1.0.2.pl
+++ b/src/perl/DM_MarkerScan_v1.0.2.pl
@@ -629,12 +629,24 @@ =head1 ARGUMENTS
 
 Use this option if you expect your data to have really low coverage. Sensibility and precision of the markers identified could be decrease.
 
-=item B<-SI|--sliding_interval [int_value] [Default 1]>
+=item B<-SI|--sliding_interval [int_value] [Default 10]>
 
 When discovering new markers, DOMINO checks each region using a sliding window approach. This option SI is the increment to loop around this sequence. 
 
 If too much markers are retrieved, please increment this value to obtain more spaced regions.
 
+=item B<-C-SI_inc [int_value] [Default: Self-Adjustment]>
+
+When discovering new markers, DOMINO checks each region using a sliding window approach. This options sets the interval to move if given a range for conserved region size.
+
+If not given, the interval would be adjusted according to the range provided.
+
+=item B<-V-SI_inc [int_value] [Default: Self-Adjustment]>
+
+When discovering new markers, DOMINO checks each region using a sliding window approach. This options sets the interval to move if given a range for the variable region size.
+
+If not given, the interval would be adjusted according to the range provided.
+
 =item B<-dnaSP [Default Off]>
 
 Use this option along with RADseq file or MSA [file|folder] to report any alignment with a minimun number of variations independently of the taxa given.
@@ -879,8 +891,12 @@ =head1 CITATION
 if (!$variable_positions_user_range and !$variable_divergence) {
 	&printError("Exiting the script. A range for variable positions or a minimum divergence is missing.\n Use the option -VP|--variable_positions [min::max] or -VD|--variable_divergence [float number]..\n"); DOMINO::dieNicely();
 }
-unless (!$variable_divergence) { if ($variable_divergence < 0) {$variable_divergence = 0.000000000000000000000000000000001;} ## Set a very small value if -VD 0 
-} push (@{ $domino_params{'marker'}{'variable_divergence'} }, $variable_divergence);
+unless (!$variable_divergence) { 
+	if ($variable_divergence =~ /.*\,.*/) { $variable_divergence =~ s/\,/\./; }
+	if ($variable_divergence < 0) {$variable_divergence = 0.000000000000000000000000000000001;} ## Set a very small value if -VD 0 
+}
+push (@{ $domino_params{'marker'}{'variable_divergence'} }, $variable_divergence);
+
 
 if ($option eq "DOMINO_files") {
 	if (!$input_type) { &printError("-type_input option is missing...\nPlease provide it in order to proceed with the computation...."); DOMINO::dieNicely(); }
@@ -931,11 +947,11 @@ =head1 CITATION
 if (!$rfgopen) { $rfgopen = 5; } 			## Bowtie Defaults
 if (!$rfgexten) { $rfgexten = 3; } 			## Bowtie Defaults
 if (!$mis_penalty) { $mis_penalty = 4;} 	## Bowtie Defaults
-if (!$SLIDING_user) { $SLIDING_user = 10; } 	## sliding interval for marker search
+if (!$SLIDING_user) { $SLIDING_user = 10; } ## Default DOMINO sliding interval for marker search
 if (!$cigar_pct) { $cigar_pct = 10; }
 if (!$window_var_CONS) { $window_var_CONS = 1;}
 if (!$level_significance_coverage_distribution) { $level_significance_coverage_distribution = 1e-05; }
-if (!$missing_allowed) { $missing_allowed = 0.1;} ## Def. 0.05: When looking for a marker if 5% of the length is missing for any specie, discard 
+if (!$missing_allowed) { $missing_allowed = 0.1;} ## Def. 0.1: When looking for a marker if 10% of the length is missing for any specie, discard 
 
 ## Get ranges
 my ($variable_positions_user_min, $variable_positions_user_max);
@@ -1250,8 +1266,7 @@ =head1 CITATION
 	DOMINO::printDetails("\t- Variable Length (VL): $window_size_VARS_min -- $window_size_VARS_max (bp)\n", $param_Detail_file_markers);
 	DOMINO::printDetails("\t- Sliding window increment (SI): $SLIDING_user (bp)\n", $param_Detail_file_markers);
 	DOMINO::printDetails("\t- Sliding window increment Variable region (V-SI_inc): $VAR_inc (bp)\n", $param_Detail_file_markers);
-	DOMINO::printDetails("\t- Sliding window increment Conserved region (C-SI_inc): $CONS_inc (bp)\n", $param_Detail_file_markers);
-	
+	DOMINO::printDetails("\t- Sliding window increment Conserved region (C-SI_inc): $CONS_inc (bp)\n", $param_Detail_file_markers);	
 }
 
 if ($variable_divergence) {
@@ -1295,10 +1310,8 @@ =head1 CITATION
 ################################################################################################
 ################# 		Mapping/Alignment of the contigs 		################################
 ################################################################################################
-if (!$avoid_mapping) {
-	
+if (!$avoid_mapping) {	
 	if ($option ne "msa_alignment") {
-
 		## We would use Bowtie2 for mapping the reads		
 		DOMINO::printHeader("", "#");	DOMINO::printHeader(" Mapping Process started ", "#"); DOMINO::printHeader("", "#"); print "\n";
 		
@@ -2215,7 +2228,7 @@ =head1 CITATION
 					# Dump into file # print Dumper \%domino_files_msa;
 					DOMINO::printDump(\%domino_files_msa, $dump_folder_files);	
 		}}}
-		$pm_MARKER_MSA_files->finish;
+		$pm_MARKER_MSA_files->finish();
 	}
 	$pm_MARKER_MSA_files->wait_all_children;
 	print "\n\n";
@@ -2293,6 +2306,7 @@ =head1 CITATION
 	} else { File::Copy::move($mapping_markers_errors_details, $marker_dirname."/"); }	## Finish and exit
 	&finish_time_stamp(); print "\n\n Job done succesfully, exiting the script\n\n\n"; exit();
 }
+&time_log(); print "\n";
 
 ## Other types of data
 ##################################
@@ -2472,18 +2486,16 @@ =head1 CITATION
 			push (@{ $pileup_files_threads{$contigs}{'mergeProfile'} }, $file);
 			print OUT_PILEUP ">$contigs\n$tmp_string\n"; 
 			my $fileReturned = &sliding_window_conserve_variable(\$contigs, \$tmp_string, $PILEUP_merged_folder_abs_path);
-			if ($fileReturned eq 0) {  
-				undef $pileup_files_threads{$contigs}; $pm_MARKER_PILEUP->finish(); 
-			} else { 
-				push (@{ $pileup_files_threads{$contigs}{'mergeCoord'} }, $$fileReturned);
-			}
+			if (-e -r -s $fileReturned) {
+				push (@{ $pileup_files_threads{$contigs}{'mergeCoord'} }, $fileReturned);
+			} else { $pm_MARKER_PILEUP->finish(); }
 		} else { $pm_MARKER_PILEUP->finish(); }
 		close (OUT_PILEUP); 
 		
 		######################################################################
 		## Check the coordinates foreach taxa against the merge statistics  ##
 		######################################################################
-
+		
 		foreach my $taxa (sort keys %domino_files) {
 			unless ($domino_files{$taxa}{'taxa'}) { next; }
 			if ($taxa eq $ref_taxa) {next;}
@@ -2501,58 +2513,61 @@ =head1 CITATION
 				&get_coordinates_each_taxa(\$pileup_each_taxa, \$pileup_files_threads{$contigs}{'mergeCoord'}[0], $taxa, \$output_file, \$error_file);
 				push (@{ $pileup_files_threads{$contigs}{'eachTaxaCoord'} }, $output_file);
 		}}
-		unless (!$pileup_files_threads{$contigs}{'mergeCoord'}[0]) { 
-			##########################################
-			## Get Coordinates of Molecular Markers ##
-			##########################################		
-			my $coord_retrieved_ref = &get_shared_coordinates(\@{$pileup_files_threads{$contigs}{'eachTaxaCoord'} }, \$pileup_files_threads{$contigs}{'mergeCoord'}[0], $ref_taxa);
-			if (!$coord_retrieved_ref) {$pm_MARKER_PILEUP->finish();}
-			## Print in tmp file for sorting and obtaining unique
-			chdir $PILEUP_merged_folder_abs_path;
-			my $tmp_file = $PILEUP_merged_folder_abs_path."/".$contigs."_markers_shared.txt";
-			open(TMP, ">$tmp_file");
-			for my $scaffold (sort keys %{ $coord_retrieved_ref } ) {    		
-				## Check we find markers for all the taxa desired
-				my @string = split(";", $scaffold);
-				my @array_taxa_split = @{ $$coord_retrieved_ref{$scaffold}{'taxa'} };
-				my $species2find = $minimum_number_taxa_covered;
-				if (scalar @array_taxa_split < $species2find) { next; }   		
-				## Write DOMINO Markers Coordinates in tmp txt file
-				my $string = join("\t", @string);
-				my @sort_taxa = sort(@array_taxa_split);
-				my $string_taxa = join(",", @sort_taxa);
-				print TMP "$string\t$string_taxa\n"; 
-			} close(TMP);	
-			unless (-e -r -s $tmp_file) { $pm_MARKER_PILEUP->finish(); }
-			
-			## Collapse markers
-			my $file_markers_collapse = &check_overlapping_markers($tmp_file, \$pileup_files_threads{$contigs}{'mergeProfile'}[0]);
-
-			# Retrieve fasta sequences...
-			my %fasta_msa;
-			foreach my $taxa (sort keys %domino_files) {
-				unless ($domino_files{$taxa}{'taxa'}) { next; }
-				if ($taxa eq $ref_taxa) {next;}
+		unless (-e -r -s $pileup_files_threads{$contigs}{'mergeCoord'}[0]) { $pm_MARKER_PILEUP->finish(); }
+
+		##########################################
+		## Get Coordinates of Molecular Markers ##
+		##########################################		
+		my $coord_retrieved_ref = &get_shared_coordinates(\@{$pileup_files_threads{$contigs}{'eachTaxaCoord'} }, \$pileup_files_threads{$contigs}{'mergeCoord'}[0], $ref_taxa);
+		if (!$coord_retrieved_ref) {$pm_MARKER_PILEUP->finish();}
+		## Print in tmp file for sorting and obtaining unique
+		chdir $PILEUP_merged_folder_abs_path;
+		my $tmp_file = $PILEUP_merged_folder_abs_path."/".$contigs."_markers_shared.txt";
+		open(TMP, ">$tmp_file");
+		for my $scaffold (sort keys %{ $coord_retrieved_ref } ) {    		
+			## Check we find markers for all the taxa desired
+			my @string = split(";", $scaffold);
+			my @array_taxa_split = @{ $$coord_retrieved_ref{$scaffold}{'taxa'} };
+			if (scalar @array_taxa_split < $minimum_number_taxa_covered) { next; }   		
+			## Write DOMINO Markers Coordinates in tmp txt file
+			my $string = join("\t", @string);
+			my @sort_taxa = sort(@array_taxa_split);
+			my $string_taxa = join(",", @sort_taxa); #taxa providing variation
+			print TMP "$string\t$string_taxa\n"; 
+		} close(TMP);	
+		unless (-e -r -s $tmp_file) { $pm_MARKER_PILEUP->finish(); }
+
+		## Collapse markers
+		my $file_markers_collapse = &check_overlapping_markers($tmp_file, \$pileup_files_threads{$contigs}{'mergeProfile'}[0]);
+		
+		# Retrieve fasta sequences...
+		my %fasta_msa;
+		foreach my $taxa (sort keys %domino_files) {
+			unless ($domino_files{$taxa}{'taxa'}) { next; }
+			if ($taxa eq $ref_taxa) {next;}
+			if ($pileup_files{$contigs}{$taxa."_FASTA"}) {
 				$fasta_msa{$contigs}{$taxa} = $pileup_files{$contigs}{$taxa."_FASTA"};
-			}
-			my $reference_file_contig = $domino_files{$ref_taxa}{'REF_DIR'}[0]."/".$contigs.".fasta";
-			if (-e -r -s $reference_file_contig) {
-				$fasta_msa{$contigs}{$ref_taxa} = $reference_file_contig;
-				my $output_file = $PILEUP_merged_folder_abs_path."/".$contigs."_markers_retrieved.txt";
-				push (@{ $pileup_files_threads{$contigs}{'markers'} }, $output_file);
-				my $markers_print_ref = &check_DOMINO_marker($output_file, \%fasta_msa, $dir2print_markers, $file_markers_collapse, $contigs);
-				unless (scalar @$markers_print_ref == 0) { 
-					push (@{ $pileup_files_threads{$contigs}{'markers_files'} }, @$markers_print_ref);
-					my $dump_folder_files = $dir_Dump_file."/dump_markers_".$contigs.".txt";
-					DOMINO::printDump(\%pileup_files_threads, $dump_folder_files);	
-		}}}
+			} else { $fasta_msa{$contigs}{$taxa} = "missing"; }
+		}
+		my $reference_file_contig = $domino_files{$ref_taxa}{'REF_DIR'}[0]."/".$contigs.".fasta";
+		unless (-e -r -s $reference_file_contig) {$pm_MARKER_PILEUP->finish();}
+		$fasta_msa{$contigs}{$ref_taxa} = $reference_file_contig;
+		my $output_file = $PILEUP_merged_folder_abs_path."/".$contigs."_markers_retrieved.txt";
+		push (@{ $pileup_files_threads{$contigs}{'markers'} }, $output_file);
+		my $markers_print_ref = &check_DOMINO_marker($output_file, \%fasta_msa, $dir2print_markers, $file_markers_collapse, $contigs);
+		unless (scalar @$markers_print_ref == 0) { 
+			push (@{ $pileup_files_threads{$contigs}{'markers_files'} }, @$markers_print_ref);
+			my $dump_folder_files = $dir_Dump_file."/dump_markers_".$contigs.".txt";
+			DOMINO::printDump(\%pileup_files_threads, $dump_folder_files);	
+		}
 		$pm_MARKER_PILEUP->finish(); # finish for each contig
-	} $pm_MARKER_PILEUP->wait_all_children; #each marker
+	} 
+	$pm_MARKER_PILEUP->wait_all_children; #each marker
 	print "\n\n";
 	print "******************************************************\n";
 	print "**** All parallel parsing processes have finished ****\n";
 	print "******************************************************\n\n";
-
+	&time_log(); print "\n";
 	## Retrieve info of all markers identified...
 	my $dump_files = DOMINO::readDir($dir_Dump_file);
 	for (my $i=0; $i < scalar @$dump_files; $i++) {
@@ -2651,14 +2666,14 @@ =head1 CITATION
 print "\n\n"; DOMINO::printHeader("", "#"); DOMINO::printHeader(" Clustering markers for unique results ", "#"); DOMINO::printHeader("", "#");
 my $blast_dir = $marker_dirname."/clustering";
 mkdir $blast_dir, 0755; chdir $blast_dir; 	&debugger_print("Changing dir to $blast_dir");
+
 # read dir
 my $files_dir_ref = DOMINO::readDir($marker_dirname);
 my @markers_folders = @$files_dir_ref;
+
 # output files
-my $all_coordinates_file = $blast_dir."/all_coordinates.fasta";
-my $all_contigs_file = $blast_dir."/all_contigs.fasta";
-open (ALL_CONTIGS, ">$all_contigs_file"); 
-open (ALL_coordinates, ">$all_coordinates_file");
+my $all_coordinates_file = $blast_dir."/all_coordinates.fasta"; open (ALL_coordinates, ">$all_coordinates_file");
+my $all_contigs_file = $blast_dir."/all_contigs.fasta"; open (ALL_CONTIGS, ">$all_contigs_file"); 
 print "+ Merging different DOMINO markers according to the taxa of reference...\n";
 
 # get sequences
@@ -2668,17 +2683,16 @@ =head1 CITATION
 		push (@CoordMarker, $domino_files{$keys}{'coordinates'}[0]);
 		my $coordinate_file = $domino_files{$keys}{'markers'}[0];
 		if (-e -r -s $coordinate_file) {
-			open (COORD_FILE, $coordinate_file); while (<COORD_FILE>) { 
-				if ($_ =~ /^>(.*)/) {
-					my $line = $_;
-					chomp $line;
-					print ALL_coordinates $line."_taxa_".$keys."\n";
-				} else { print ALL_coordinates $_; }
-			} close(COORD_FILE);	
-		}
+			my $hash = DOMINO::readFASTA_hash($coordinate_file);
+			foreach my $seq (keys %{$hash}) {
+				print ALL_coordinates ">".$seq."_taxa_".$keys."\n".$$hash{$seq}."\n";
+		}}
 		my $contig_file = $domino_files{$keys}{'CONTIGS'}[0];
 		if (-e -r -s $contig_file) {
-			open (CONTIG_FILE, $contig_file); while (<CONTIG_FILE>) { print ALL_CONTIGS $_; } close(CONTIG_FILE);
+			my $size_file = DOMINO::get_size($contig_file);
+			open (FH, $contig_file);
+			my $chunk; read(FH, $chunk, $size_file); 
+			print ALL_CONTIGS $chunk;
 }}} close(ALL_CONTIGS); close(ALL_coordinates);
 
 ## Use BLAST for clustering sequences
@@ -2700,6 +2714,7 @@ =head1 CITATION
 my (%markers2keep, @markers_seen, %clusterized_contigs_keep);
 my $first_hit = 0;
 my $aln_overlapped = $window_size_VARS_max + $window_size_CONS_max;
+print "+ Clustering markers with > $aln_overlapped bp overlapped...\n";
 open (BLAST, $blast_search); while (<BLAST>) {
 	my $line = $_;
 	chomp $line;
@@ -2708,11 +2723,11 @@ =head1 CITATION
 	if ($query eq $subject) { next;} # same sequence
 	my $query_string; my $subject_taxa;
 	my $subject_string; my $taxa_query;
-	if ($query =~ /(.*)\_taxa\_(.*)/) { 	$query_string = $1; 	$taxa_query = $2; }
-	if ($subject =~ /(.*)\_taxa\_(.*)/){	$subject_string = $1;	$subject_taxa = $2; }
+	if ($query =~ /(.*)\_taxa\_(.*)/) { $query_string = $1;   $taxa_query = $2; }
+	if ($subject =~ /(.*)\_taxa\_(.*)/){$subject_string = $1; $subject_taxa = $2; }
 	if ($subject_taxa eq $taxa_query) { next; } ## if same taxa
 	if ($array[10] < 1e-20 && $array[3] > $aln_overlapped) {    ## how to clusterize...
-		if ($query =~ /(.*)_coord_(.*)_taxa_(.*)/) { $clusterized_contigs_keep{$1}++; }
+		if ($query =~ /(.*)\_coord\_(.*)\_taxa\_(.*)/) { $clusterized_contigs_keep{$1}++;}
 		if ($first_hit == 0) {
 			$first_hit++;
 			push( @{$markers2keep{$query_string} }, $subject_string); 
@@ -2723,13 +2738,13 @@ =head1 CITATION
 				if (grep /.*$subject_string.*/, @{$markers2keep{$keys}}) { $flag_this = 1; last; }
 				if (grep /.*$query_string.*/, @{$markers2keep{$keys}}) { $flag_this = 1; last; }
 			}
-			if ($flag_this == 0) {
-				push(@{$markers2keep{$query_string}}, $subject_string); 
-				push (@markers_seen, $subject_string);
-			}
+		if ($flag_this == 0) { 	push(@{$markers2keep{$query_string}}, $subject_string);  push (@markers_seen, $subject_string); }
 }}} close (BLAST);
 foreach my $keys (sort keys %$contig_length_Ref) { 
-if ($keys =~ /(.*)\_taxa\_(.*)/) { unless (grep /$1/, @markers_seen) { $markers2keep{$1}++;}}}
+	if ($keys =~ /((.*)\_coord\_(.*))\_taxa\_(.*)/) { 
+		unless (grep /$1/, @markers_seen) { 
+			$markers2keep{$1}++; $clusterized_contigs_keep{$2}++;
+}}}
 
 ## Printing definitely Results
 my $definitely_results_dirname = $marker_dirname."/DOMINO_markers_Results";
@@ -2739,14 +2754,13 @@ =head1 CITATION
 open (CONTIGS_DEF, ">$contig_def_results_sequences");
 my $hash_contigs_ref = DOMINO::readFASTA_hash($all_contigs_file);
 my %hash_contigs = %$hash_contigs_ref;
-foreach my $keys (sort keys %hash_contigs) { 
+foreach my $keys (sort keys %hash_contigs) {
 if ($clusterized_contigs_keep{$keys}) { print CONTIGS_DEF ">".$keys."\n".$hash_contigs{$keys}."\n";	}}
 close(CONTIGS_DEF);
 
 my %hash4markers2keep;
 print "+ Filtering coordinates files...\n\n";
 for (my $h = 0; $h < scalar @CoordMarker; $h++) {
-	## Check for coordinates		
 	open (tmp_COOR, $CoordMarker[$h]);
 	while (<tmp_COOR>) {
 		my $line = $_;
@@ -2755,23 +2769,23 @@ =head1 CITATION
 		if ($line =~ /.*Vari/) {next;}
 		my @array_split = split("\t",$line);
 		my @contig_split = split("\_#",$array_split[0]);
-		if ($clusterized_contigs_keep{ $contig_split[0] } ) {
+		if ($clusterized_contigs_keep{$contig_split[0]}) {
 			my @array_start = split(":", $array_split[1]);
 			my @array_end = split(":", $array_split[3]);
 			my $coord_string = $array_start[0]."_".$array_end[1];
 			my $coord2check = $contig_split[0]."_coord_".$coord_string;
 			if ($markers2keep{$coord2check}) {
-				$hash4markers2keep{$contig_split[0]}{$array_start[0]} = $line;
-	}}} close(tmp_COOR);		
-}
+				$hash4markers2keep{$contig_split[0]}{$coord_string} = $line;
+}}} close(tmp_COOR); }
 
 my $coordinates_def_results = "DM_markers-coordinates.txt";
 open (COOR, ">$coordinates_def_results");
 print COOR "Contig\t\tConserved_Region\tVariable_Region\tConserved_Region\tMapping_Taxa\Length\tDivergence\n";
 my %rename;
+
 foreach my $contigs (sort keys %hash4markers2keep) {
 	my $counter = 0;
-	foreach my $markers (sort {$a <=> $b} keys %{ $hash4markers2keep{$contigs} }) {
+	foreach my $markers (sort keys %{ $hash4markers2keep{$contigs} }) {
 		$counter++;
 		my $string2change = $hash4markers2keep{$contigs}{$markers};
 		my $stringchanged;
@@ -2783,9 +2797,8 @@ =head1 CITATION
 		$rename{$marker2search} = $tmp;
 		for (my $i=0; $i < scalar @array; $i++) { $stringchanged .= $array[$i]."\t"; }			
 		print COOR $stringchanged."\n";
-	} print COOR "\n";
-}
-close(COOR); &time_log();	print "\n";
+} print COOR "\n"; }
+close(COOR); &time_log(); print "\n";
 
 ## Get MSA markers
 my $markers_msa_folder = $definitely_results_dirname."/MSA_markers"; mkdir $markers_msa_folder, 0755;
@@ -2887,6 +2900,7 @@ sub check_given_marker {
 	
 	my $array_Coord = $_[0];
 	my $dna_seq = $_[1];
+	my $total_length_sub = length($dna_seq);
 	
 	my @coordinates = @{ $array_Coord };
 	my $coord_P1 = $coordinates[0]; my $coord_P2 = $coordinates[1];
@@ -2914,6 +2928,14 @@ sub check_given_marker {
 		#print Dumper $array_Coord; print "ERROR! length_string_P3_P4 $length_string_P3_P4 > $window_size_VARS_max\n"; 
 		return 1;
 	}
+	my $string_P3_P4 = substr($dna_seq, $coord_P3, $length_string_P3_P4);
+	my $count_string_P3_P4 = $string_P3_P4 =~ tr/1//; ## Variable
+	
+	if ($variable_divergence) {
+		# If a minimun divergence, get the expected variable sites for the length
+		my $expected_var_sites = int($variable_divergence * $total_length_sub);
+		unless ($count_string_P3_P4 >= $expected_var_sites) { return 1; }			
+	}
 
 	# Conserved
 	my $length_string_P5_P6 = $coord_P6 - $coord_P5;
@@ -2939,10 +2961,8 @@ sub check_given_marker {
 sub check_overlapping_markers {
 
 	## Overlaps and maximizes domino markers obtained
-	my $file = $_[0];
-	my $mergeArray_file = $_[1];
-	my $contig_id; my %tmp_hash;
-	my $marker_counter_tmp = 0;
+	my $file = $_[0]; my $mergeArray_file = $_[1];
+	my $contig_id; my %tmp_hash; my $marker_counter_tmp = 0;
 	open (FILE, $file);
 	while (<FILE>) {
 		my $line = $_;
@@ -2983,7 +3003,7 @@ sub check_overlapping_markers {
 				if ($bad_counter > 3) { ## We would consider the same marker if overlapping 3 SLIDING_user!!
 					($bool,$counter,$bad_counter) = 0;
 	}}}}}
-	
+
 	my %tmp_coord;
 	foreach my $taxa (keys %tmp_hash ) {
 		foreach my $marker (keys $tmp_hash{$taxa}) {
@@ -2999,17 +3019,15 @@ sub check_overlapping_markers {
 	if ($range > 500) { 		@length = (100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1500);
 	} elsif ($range < 500) {	@length = (50, 100, 200, 300, 400, 500); }
 
-	# Debug print Dumper \%tmp_hash;
+	# Debug print Dumper \%tmp_coord;
 	my $hash_ref = DOMINO::readFASTA_hash($$mergeArray_file);
-	my $dna_seq;
-	foreach my $ref (sort keys %{$hash_ref}) { $dna_seq = $$hash_ref{$ref}; } ## there is only 1 seq
-	my @array = split(".txt", $file); my $file2return = $array[0]."_overlapped_Markers.txt"; open (OUT, ">$file2return");
+	my $dna_seq; foreach my $ref (keys %{$hash_ref}) { $dna_seq = $$hash_ref{$ref}; } ## there is only 1 seq
+	my @array = split(".txt", $file); 
+	my $file2return = $array[0]."_overlapped_Markers.txt"; open (OUT, ">$file2return");
 	foreach my $taxa (keys %tmp_coord) {		
 		foreach my $keys_markers (keys %{ $tmp_coord{$taxa} }) {		
-			my %marker_seen;
 			my @array_coordinates = @{ $tmp_coord{$taxa}{$keys_markers} };
-			my @good_ones;
-			my %hash2print; 
+			my (@good_ones, %hash2print, %marker_seen); 
 			for (my $i=0; $i < scalar @array_coordinates; $i++) {
 				my $coordinate = $array_coordinates[$i];
 				my @array_coord = split (",", $coordinate);
@@ -3018,7 +3036,7 @@ sub check_overlapping_markers {
 					my @array_coord2 = split (",", $coordinate2);
 					my @array2check = ($array_coord[0], $array_coord2[1], $array_coord2[2], $array_coord2[3], $array_coord2[4], $array_coord2[5]);
 					my $string = join(":", @array2check).":".$taxa;
-					if ( $marker_seen{$string} ) {next;}
+					if ($marker_seen{$string}) {next;}
 					my $result = &check_given_marker(\@array2check, $dna_seq);
 					if ($result ne 1) {
 						my $id;
@@ -3042,13 +3060,11 @@ sub check_overlapping_markers {
 
 sub check_DOMINO_marker {
 	
-	my $file = $_[0];
-	my $fasta_seq_ref_hash = $_[1];
-	my $dir = $_[2];
-	my $DOMINO_markers_file = $_[3];
+	my $file = $_[0];	my $fasta_seq_ref_hash = $_[1];
+	my $dir = $_[2];	my $DOMINO_markers_file = $_[3];
 	my $seq_name = $_[4];
 	my @files;
-	
+
 	## Check each group of overlapping markers
 	open (MARKERS, "$DOMINO_markers_file") or die "Could not open file $DOMINO_markers_file";
 	my $j = 1; my %hash_array;
@@ -3057,14 +3073,14 @@ sub check_DOMINO_marker {
 		if (!eof(MARKERS)) { 
 			$/ = "\/\/"; ## Telling perl where a new line starts
 			$chunk = <MARKERS>; push(@{$hash_array{$j}}, $chunk);
-		} $j++; last if eof(MARKERS);
+		} 
+		$j++; last if eof(MARKERS);
 	}
 	$/ = "\n";	
 	#print Dumper \%hash_array; my $counter_tmp=0; open (OUT, ">$file");	
-
+	
 	my $marker_number = 0;
 	foreach my $group (sort {$a <=> $b} keys %hash_array) {
-		
 		## Split chunk push into array
 		my @array_markers_tmp = @{ $hash_array{$group} };
 		my @array_markers;
@@ -3076,7 +3092,6 @@ sub check_DOMINO_marker {
 				if ($array[$j] =~ ".*:.*") {
 					push(@array_markers, $array[$j]);
 		}}}
-
 		## For each group of markers, evaluate the features
 		for (my $i=0; $i < scalar @array_markers; $i++) {
 			my @array = split(":", $array_markers[$i]);
@@ -3084,10 +3099,12 @@ sub check_DOMINO_marker {
 			my $coord_P3 = $array[2]; my $coord_P4 = $array[3];
 			my $coord_P5 = $array[4]; my $coord_P6 = $array[5];
 			my $taxa = $array[6];
+			
 			## Retrieve msa for this marker			
 			my %hash;
 			my %fasta_msa_sub = %{ $$fasta_seq_ref_hash{$seq_name} };
 			foreach my $keys (sort keys %fasta_msa_sub ) {			
+				if ($fasta_msa_sub{$keys} =~ /missing/) {next;}
 				if ($fasta_msa_sub{$keys} =~ /.*fasta/) {
 					open (FILE, $fasta_msa_sub{$keys});
 					my ($titleline, $sequence);
@@ -3107,6 +3124,7 @@ sub check_DOMINO_marker {
 					my ($seq_id, $seq) = &fetching_range_seqs($keys, $coord_P1, $coord_P6, $fasta_msa_sub{$keys});
 					$hash{$keys} = $seq;
 			}}
+			
 			## Check pairwise MSA
 			my $valueReturned = &check_marker_pairwise(\%hash);
 			if ($valueReturned == 1) { ## if it is variable for each pairwise comparison
@@ -3165,14 +3183,13 @@ sub check_marker_pairwise {
 			my $var_sites_sub = $string =~ tr/1/1/;
 			my $con_sites_sub = $string =~ tr/0/0/;
 			my $total_sub = $con_sites_sub + $var_sites_sub;			
-			if ($var_sites_sub == 0) {
-				## If does not fit the necessary divergence
-				$seen{$reference}++; $discard{$keys}++; ## avoid checking if it is not variable
-				$pairwise{$reference}{$keys} = "NO!";
+			if ($var_sites_sub == 0) { ## If does not fit the necessary divergence
+				$seen{$reference}++; $discard{$keys}++; $pairwise{$reference}{$keys} = "NO!";
+				&debugger_print($reference."\t".$keys."\t".$var_sites_sub."\t".$con_sites_sub."\t0\t".$variable_divergence);
 				next;
 			}
 			my $percentage_sub = $var_sites_sub/$total_sub;
-			#print $reference."\t".$keys."\t".$string."\t".$var_sites_sub."\t".$con_sites_sub."\t".$percentage_sub."\t".$total_sub."\n";
+			&debugger_print($reference."\t".$keys."\t".$var_sites_sub."\t".$con_sites_sub."\t".$percentage_sub."\t".$variable_divergence);
 			if ($variable_positions_user_min) {
 				if ($var_sites_sub > $variable_positions_user_min) { 
 					$pairwise{$reference}{$keys} = "YES!";
@@ -3199,17 +3216,22 @@ sub check_marker_pairwise {
 			if ($pairwise{$keys}{$k} eq 'YES!') {
 				$flag_fitting++;
 	}}}
-	
+
+	#print Dumper \%pairwise;
 	if ($number_sp == 2) {
 		if ($flag_fitting == 1) {
-			return '1'; #print "YES!\n";
+			#print "YES!\n"; 
+			return '1'; 
 		} else {
-			return '0'; #print "NO!\n";
+			#print "NO!\n";
+			return '0'; 
 	}} else {
 		if ($flag_fitting < $minimum_number_taxa_covered) {
-			return '0'; #print "NO!\n";
+			#print "NO!\n";
+			return '0'; 
 		} else {
-			return '1'; #print "YES!\n"; #print Dumper \%pairwise;
+			#print "YES!\n"; 
+			return '1'; 
 	}}	
 }
 
@@ -3258,7 +3280,6 @@ sub check_marker_ALL {
 
 	my @tmp_length = sort @length_seqs;
 	my @tmp_length_uniq = uniq(@tmp_length);	
-	
 	if (scalar @tmp_length_uniq > 1) {
 		&printError("There is problem: length of the markers do not match for $file..."); return "";
 	} else { $length_seqs[0] = $length; }	
@@ -3653,7 +3674,7 @@ sub get_coordinates_each_taxa {
 		my $expected_var_sites;	my $flag_error=0;
 		if ($variable_divergence) {
 			# If a minimun divergence, get the expected variable sites for the length
-			$expected_var_sites = ($variable_divergence * $total_length)*100;
+			$expected_var_sites = int($variable_divergence * $total_length);
 			unless ($count_string_P3_P4 >= $expected_var_sites) { $flag_error++; }			
 		} else {
 			# User provides a number of variable positions or a range
@@ -4434,7 +4455,8 @@ sub print_Excel {
 		$worksheet_markers->write($row_markers, $col_markers, "% Variable positions", $format_bold);  $row_markers++;
 	
 		for (my $i = 0; $i < scalar @array_markers; $i++) {
-			if ($array_markers[$i] eq "undef") { $row_markers++; next;}
+			if ($array_markers[$i] eq "") { $row_markers++; next;}
+			
 			my @split = split("\t", $array_markers[$i]);
 			$worksheet_markers->write($row_markers, $first_col, $split[0], $format); # Contig name
 			my $position = $first_col + 1;
@@ -4896,10 +4918,14 @@ sub sliding_window_conserve_variable {
 				my $count_string_P1_P2 = $string_P1_P2 =~ tr/1//; ## Conserved 
 				my $count_string_P3_P4 = $string_P3_P4 =~ tr/1//; ## Variable
 				my $count_string_P5_P6 = $string_P5_P6  =~ tr/1//; ## Conserved
+				my $count_N_P1_P2 = $string_P1_P2 =~ tr/N//; ## Conserved 
+				my $count_N_P5_P6 = $string_P5_P6  =~ tr/N//; ## Conserved
 				
 				## More than variations allowed in conserved
 				if ($count_string_P1_P2 > $window_var_CONS) { next; } 
 				if ($count_string_P5_P6 > $window_var_CONS) { next; }
+				if ($count_N_P1_P2 > $window_var_CONS) { next; } 
+				if ($count_N_P5_P6 > $window_var_CONS) { next; } 
 
 				#Get marker profile					
 				my $total_length = $coord_P6 - $coord_P1;
@@ -4909,7 +4935,7 @@ sub sliding_window_conserve_variable {
 				my $expected_var_sites;	my $flag_error=0;
 				if ($variable_divergence) {
 					# If a minimun divergence, get the expected variable sites for the length
-					$expected_var_sites = ($variable_divergence * $total_length)*100;
+					$expected_var_sites = int($variable_divergence * $total_length);
 					unless ($count_string_P3_P4 >= $expected_var_sites) { $flag_error++; }			
 				} else {
 					# User provides a number of variable positions or a range
@@ -4930,28 +4956,27 @@ sub sliding_window_conserve_variable {
 				if ($missing_count_percent < $percent_total_length) {
 					print OUTPUT "$$id\t$coord_P1:$coord_P2\t$coord_P3:$coord_P4\t$coord_P5:$coord_P6\n";
 					#if ($debugger) {
-					#	print "\n\n***********************************************\n";
-					#	print "Marker: $coord_P1:$coord_P2 $coord_P3:$coord_P4 $coord_P5:$coord_P6\n";
-					#	print "Contig: $$id\nMax: $seqlen\nVL: $j\nCL: $h\n\nPositions:\n";
-					#	print "P1:$coord_P1\nP2:$coord_P2\nP3:$coord_P3\nP4:$coord_P5\nP6: $coord_P6\n";
-					#	print "\nSubsets\n";
-					#	print "Conserved (P1-P2): ".length($string_P1_P2)."\n"; print "$string_P1_P2\n";
-					#	print "Variable (P3-P4): ".length($string_P3_P4)."\n";  print $string_P3_P4."\n";
-					#	print "Conserved (P5-P6): ".length($string_P5_P6)."\n"; print $string_P5_P6."\n";
-					#	print "\nVariations:\n";
-					#	print "Count_string_P1_P2: $count_string_P1_P2\n";
-					#	print "Count_string_P3_P4: $count_string_P3_P4\n";
-					#	print "Count_string_P5_P6: $count_string_P5_P6\n";
-					#	print "\nMarker:\n";
-					#	print "Total Length:$total_length\tVariable Positions:$count_string_P3_P4\tExpected:$expected_var_sites\n";
-					#	print "Missing allowed: $percent_total_length %\tMissing:$missing_count %\n";
-					#	print "***********************************************\n";
+						#print "\n\n***********************************************\n";
+						#print "Marker: $coord_P1:$coord_P2 $coord_P3:$coord_P4 $coord_P5:$coord_P6\n";
+						#print "Contig: $$id\nMax: $seqlen\nVL: $j\nCL: $h\n\nPositions:\n";
+						#print "P1:$coord_P1\nP2:$coord_P2\nP3:$coord_P3\nP4:$coord_P5\nP6: $coord_P6\n";
+						#print "\nSubsets\n";
+						#print "Conserved (P1-P2): ".length($string_P1_P2)."\n"; print "$string_P1_P2\n";
+						#print "Variable (P3-P4): ".length($string_P3_P4)."\n";  print $string_P3_P4."\n";
+						#print "Conserved (P5-P6): ".length($string_P5_P6)."\n"; print $string_P5_P6."\n";
+						#print "\nVariations:\n";
+						#print "Count_string_P1_P2: $count_string_P1_P2\n";
+						#print "Count_string_P3_P4: $count_string_P3_P4\n";
+						#print "Count_string_P5_P6: $count_string_P5_P6\n";
+						#print "\nMarker:\n";
+						#print "Total Length:$total_length\tVariable Positions:$count_string_P3_P4\tExpected:$expected_var_sites\n";
+						#print "Missing allowed: $percent_total_length %\tMissing:$missing_count_percent %\n";
+						#print "***********************************************\n";
 					#}
 				} else { next;	
 	}}}}
 	close(OUTPUT);
-	if (-e -r -s $output_file_coordinates) { return \$output_file_coordinates;	
-	} else { return 0; }	
+	return $output_file_coordinates;	
 }
 
 sub time_log {