Merging from upstream

quadram-institute-bioscience · Mar 17, 2019 · dcbe3a7 · dcbe3a7
2 parents ba552a8 + 449d007
commit dcbe3a7
Show file tree

Hide file tree

Showing 13 changed files with 246 additions and 244 deletions.
diff --git a/README.md b/README.md
@@ -160,13 +160,15 @@ __window_size__: The size of the sliding window in bases. If you set this too hi
 
 __annotation.embl__: This file contains a modified version of the input annotation file in EMBL format. For example, it adds 5' and 3' features to each gene. It can be visualised in Artemis.
 
-__gene_report.csv__: This is the main results file in the output of the script. It contains a tab delimited spreadsheet detailing genes identified as being interesting, in the below format. The first column is the gene name derived from the input annotation file. If a signal is identified in an intergenic region, the start and end coordinates are given. The next column is a categorisation of the mechanism, such as up or down regulation, knockout, unclassified etc.... The 3rd and 4th columns are the coordinates of the start and end of the gene, relative to the input annotation file. The MaxLogFC is the maximum log fold change in the signal observed in the gene (or in the 5'/3'). It is rounded to the nearest integer. The expression column indicates if the gene is experiencing an increase or decrease in insertions. The direction column indicates which direction the significant insertions where primarily detected in (or no direction if both apply). Finally the last column gives the upstream gene, which is often implicated in the mechanism.
+__gene_report.csv__: This is the main results file in the output of the script. It contains a tab delimited spreadsheet detailing genes identified as being interesting, in the below format. The first column is the gene name derived from the input annotation file. If a signal is identified in an intergenic region, the start and end coordinates are given. The next column is a categorisation of the mechanism, such as up or down regulation, knockout, unclassified etc.... The 3rd and 4th columns are the coordinates of the start and end of the gene, relative to the input annotation file. The MaxLogFC is the maximum log fold change in the signal observed in the gene (or in the 5'/3'). It is rounded to the nearest integer.  If the signal is strongest in a single direction, the max log fold change in that direction is reported rather than the value for the combined analysis. The expression column indicates if the gene is experiencing an increase or decrease in insertions. The direction column indicates which direction the significant insertions where primarily detected in (or no direction if both apply). Finally the last column gives the upstream gene, which is often implicated in the mechanism.
 
 |  Gene | Category | Start | End | MaxLogFC | Expression | Direction | Upstream |
 |  --- | --- | --- | --- | --- | --- | --- | --- |
 |  zabC | downregulated | 100 | 500 | 1 | increased_insertions | forward | abc |
 |  yxxY | upregulated | 135 | 234 | 1 | decreased_insertions | reverse | efg |
 
+__regulated_gene_report.csv__: This is a filtered version of the gene_report.csv file but only includes genes which are identified as upregulated or downregulated. If no genes are identified with this pattern, the file will not be created.  This is really only useful if the experiements included a promotor.
+
 __combined.csv__: This comma delimited spreadsheet is the output of the Bio-TraDIS toolkit, with additional essentiality categoristations, and an example is listed below. This is the raw data from which the gene_report.csv is derived. It lists each gene or sliding window, and optionally the corresponding 5' and 3' features for a gene. The first 2 columns list the names of the gene or give the coordinates of the sliding window. The 3rd column lists the annotated function of the gene (if available in the annotation file). The numerical columns are derived from EdgeR. The 4th column gives the log fold change between the conditions and the controls. The 5th column gives the log counts per million, which can be thought of as relative abundance. The final column indicate how the essentiality has changed between the conditions and the controls, so a gene can always be non-essential in both the controls and the conditions or essential in all cases. More interestingly though is where there is a change in essentiality between the control and the conditions, indicating a large mechanistic change. 
 
 | locus_tag | gene_name | function | logFC | logCPM | PValue | q.value | Essentiality |

diff --git a/albatradis/BlockIdentifier.py b/albatradis/BlockIdentifier.py
@@ -11,14 +11,14 @@ def __init__(self, combined_mask_file, forward_mask_file, reverse_mask_file, win
 		self.window_size = window_size
 		self.logfc_direction_change = 1
 
-	def increased_insertions_blocks(self, masking_plot):
+	def increased_insertions_blocks(self, masking_plot_combined):
 		blocks = []
 		inblock = False		
 		start = 0
 		end = 0
 		max_logfc = 0 
 
-		for i, mask in enumerate(masking_plot.combined):
+		for i, mask in enumerate(masking_plot_combined):
 			if mask > 0 and not inblock:
 				inblock = True
 				start = i
@@ -34,17 +34,17 @@ def increased_insertions_blocks(self, masking_plot):
 
 		# Check for block at end
 		if inblock:
-			blocks.append(Block(start +1, len(masking_plot.combined), len(masking_plot.combined)-start, max_logfc, 'increased_insertions'))
+			blocks.append(Block(start +1, len(masking_plot_combined), len(masking_plot_combined)-start, max_logfc, 'increased_insertions'))
 		return blocks
 
-	def decreased_insertions_blocks(self, masking_plot):
+	def decreased_insertions_blocks(self, masking_plot_combined):
 		blocks = []
 		inblock = False		
 		start = 0
 		end = 0
 		max_logfc = 0
 
-		for i, mask in enumerate(masking_plot.combined):
+		for i, mask in enumerate(masking_plot_combined):
 			if mask < 0 and not inblock:
 				inblock = True
 				start = i
@@ -60,51 +60,61 @@ def decreased_insertions_blocks(self, masking_plot):
 
 		# Check for block at end
 		if inblock:
-			blocks.append(Block(start +1, len(masking_plot.combined), len(masking_plot.combined)-start, max_logfc, 'decreased_insertions'))
+			blocks.append(Block(start +1, len(masking_plot_combined), len(masking_plot_combined)-start, max_logfc, 'decreased_insertions'))
 
 		return blocks
 
-	def direction_for_block(self, block, forward_masking_plot, reverse_masking_plot):
-		forward_max_logfc = 0 
-		for i in range(block.start -1, block.end):
-			if numpy.absolute(forward_masking_plot.combined[i]) > forward_max_logfc:
-				forward_max_logfc = numpy.absolute(forward_masking_plot.combined[i])
-
-		reverse_max_logfc = 0 
-		for i in range(block.start -1, block.end):
-			if numpy.absolute(reverse_masking_plot.combined[i]) > reverse_max_logfc:
-				reverse_max_logfc = numpy.absolute(reverse_masking_plot.combined[i])
+	def peak_from_array(self, block_values):
+		abs_max_value = max(numpy.absolute(block_values))
+		if max(block_values) != abs_max_value:
+			abs_max_value *= -1
+		return abs_max_value
 
-		if forward_max_logfc > reverse_max_logfc:
-			if reverse_max_logfc == 0:
-				return 'forward'
-			elif forward_max_logfc >= reverse_max_logfc + self.logfc_direction_change:
+	# check if the reads are going in 1 particular direction, if so then use the max_logfc for that direction.
+	def direction_for_block(self, block, forward_masking_plot, reverse_masking_plot):
+		forward_max_logfc = self.peak_from_array(forward_masking_plot.combined[block.start-1:block.end])
+		reverse_max_logfc = self.peak_from_array(reverse_masking_plot.combined[block.start-1:block.end])
+
+		if numpy.absolute(forward_max_logfc) > numpy.absolute(reverse_max_logfc):
+			if reverse_max_logfc == 0 or forward_max_logfc >= reverse_max_logfc + self.logfc_direction_change:
+				block.max_logfc = forward_max_logfc
 				return 'forward'
 			else:
 				return 'nodirection'
 		else:
-			if forward_max_logfc == 0:
-				return 'reverse'
-			elif reverse_max_logfc >= forward_max_logfc + self.logfc_direction_change:
+			if forward_max_logfc == 0 or reverse_max_logfc >= forward_max_logfc + self.logfc_direction_change:
+				block.max_logfc = reverse_max_logfc
 				return 'reverse'
 			else:
 				return 'nodirection'
+
+	def merge_all_plots_choosing_peak_logfc(self, combined_plot, forward_plot, reverse_plot):
+		genome_length = len(combined_plot.combined)
+		masking_plot_combined = numpy.zeros(genome_length, dtype=int)
+		for i in range(0, genome_length):
+			peak_value_abs = max(numpy.absolute([combined_plot.combined[i], forward_plot.combined[i], reverse_plot.combined[i]]))
+			if peak_value_abs == max([combined_plot.combined[i], forward_plot.combined[i], reverse_plot.combined[i]]):
+				# its positive
+				masking_plot_combined[i] = peak_value_abs
+			else:
+				# its negative
+				masking_plot_combined[i] = peak_value_abs*-1
+
+		return masking_plot_combined
 
 	def block_generator(self):
-		masking_plot = PlotParser(self.combined_mask_file)
+		combined_plot = PlotParser(self.combined_mask_file)
+		forward_masking_plot = PlotParser(self.forward_mask_file)
+		reverse_masking_plot = PlotParser(self.reverse_mask_file)
+
+		masking_plot = self.merge_all_plots_choosing_peak_logfc(combined_plot, forward_masking_plot, reverse_masking_plot)
 		blocks = self.increased_insertions_blocks(masking_plot) + self.decreased_insertions_blocks(masking_plot)
 
 		'''Filter out blocks which are less than the window size'''
 		filtered_blocks = [b for b in blocks if b.block_length >= self.window_size]
-
-		forward_masking_plot = PlotParser(self.forward_mask_file)
-		reverse_masking_plot = PlotParser(self.reverse_mask_file)
+
 		for b in filtered_blocks:
 			b.direction = self.direction_for_block(b, forward_masking_plot, reverse_masking_plot)
 
 		return filtered_blocks
 
-# is it beside an essential region
-# number of reads in block
-# number of insertions
-# average reads per insertion
diff --git a/albatradis/BlockInsertions.py b/albatradis/BlockInsertions.py
@@ -64,6 +64,10 @@ def __init__(self, logger,plotfiles, minimum_threshold, window_size, window_inte
 
 		if not os.path.exists(self.prefix ):
 			os.makedirs(self.prefix )
+
+
+		if self.use_annotation:
+			self.span_gaps = 0
 
 	def run(self):
 		plotfile_objects = self.prepare_input_files()
@@ -162,24 +166,38 @@ def gene_statistics(self,forward_plotfile, reverse_plotfile, combined_plotfile,
 		if len(genes) == 0:
 			return []
 
+		self.write_gene_report(genes, intergenic_blocks)
+		self.write_regulated_gene_report(genes, intergenic_blocks)
+
+		if self.verbose:
+			self.print_genes_intergenic(genes,intergenic_blocks)
+
+		return genes
+
+	def write_gene_report(self, genes, intergenic_blocks):
 		block_filename = os.path.join(self.prefix, "gene_report.csv")
 		with open(block_filename, 'w') as bf:
 			bf.write(str(genes[0].header())+"\n")
 			for i in genes:
 				bf.write(str(i)+"\n")
-
 			for b in intergenic_blocks:
 				bf.write(str(b)+"\n")
 
-		if self.verbose:
-			print(genes[0].header())		
-			for i in genes:
-				print(i)
+	def write_regulated_gene_report(self, genes, intergenic_blocks):	
+		regulated_genes = [g for g in genes if g.category() == 'upregulated' or g.category() == 'downregulated']
+		if len(regulated_genes) > 0:
+			block_filename = os.path.join(self.prefix, "regulated_gene_report.csv")
+			with open(block_filename, 'w') as bf:
+				bf.write(str(regulated_genes[0].header())+"\n")
+				for i in regulated_genes:
+					bf.write(str(i)+"\n")
 
-			for b in intergenic_blocks:
-				print(b)
-
-		return genes
+	def print_genes_intergenic_blocks(self, genes, intergenic_blocks):
+		print(genes[0].header())		
+		for i in genes:
+			print(i)
+		for b in intergenic_blocks:
+			print(b)
 
 	def mask_plots(self):
 		pm = PlotMasking(self.plotfiles, self.combined_plotfile, self.strict_signal )

diff --git a/albatradis/Gene.py b/albatradis/Gene.py
@@ -35,9 +35,42 @@ def upstream_gene(self):
 	def max_logfc_from_blocks(self):
 		if self.blocks:
 			all_logfc = [b.max_logfc for b in self.blocks]
-			return numpy.max(numpy.absolute(all_logfc))
+			highest_logfc = numpy.max(numpy.absolute(all_logfc))
+			for a in all_logfc:
+				if a == highest_logfc:
+					return highest_logfc
+				elif a == highest_logfc*-1:
+					return highest_logfc*-1
 		else:
 			return 0
+
+
+	def max_logfc_from_category(self):
+		l = self.max_logfc_from_blocks()
+		if self.category() == 'upregulated':
+			if l < 0:
+				return l*-1
+			else:
+				return l
+		elif self.category() == 'downregulated':
+			if l > 0:
+				return l*-1
+			else:
+				return l
+
+		if self.expression_from_blocks() == 'increased_insertions':
+			if l < 0:
+				return l*-1
+			else:
+				return l
+		elif self.expression_from_blocks() == 'decreased_insertions':
+			if l > 0:
+				return l*-1
+			else:
+				return l
+
+		return l
+
 
 	def expression_from_blocks(self):
 		if self.blocks:
@@ -52,7 +85,7 @@ def direction_from_blocks(self):
 			return ""
 
 	def __str__(self):
-		return "\t".join([str(self.gene_name), str(self.category()), str(self.feature.location.start), str(self.feature.location.end), str(self.max_logfc_from_blocks()),  str(self.expression_from_blocks()), str(self.direction_from_blocks()), str(self.upstream_gene())] )
+		return "\t".join([str(self.gene_name), str(self.category()), str(self.feature.location.start), str(self.feature.location.end), str(self.max_logfc_from_category()),  str(self.expression_from_blocks()), str(self.direction_from_blocks()), str(self.upstream_gene())] )
 
 	def header(self):
 		return "\t".join(['Gene', 'Category', 'Start', 'End', 'MaxLogFC', 'Expression', 'Direction', 'Upstream'])

diff --git a/albatradis/GeneAnnotator.py b/albatradis/GeneAnnotator.py
@@ -46,13 +46,13 @@ def annotate_genes(self):
 			if len(g.categories) == 0:
 				p = self.proportion_blocks_overlap_with_gene(f, overlapping_blocks)
 				if p > 0.9:
-					g.categories.append('over_90_perc_inactivation')
+					g.categories.append('knockout')
 				elif p > 0.8:
-					g.categories.append('over_80_perc_inactivation')	
+					g.categories.append('knockout')	
 				elif p > 0.7:
-					g.categories.append('over_70_perc_inactivation')
+					g.categories.append('knockout')
 				elif p > 0.6:
-					g.categories.append('over_60_perc_inactivation')
+					g.categories.append('knockout')
 				elif p > 0.5:
 					g.categories.append('over_50_perc_inactivation')	
 
@@ -67,7 +67,7 @@ def annotate_genes(self):
 			block.intergenic = True
 
 		reannotate_with_5_3_prime = self.reannotate_5_3_prime(genes)
-
+			
 		return reannotate_with_5_3_prime 
 
 	def feature_to_gene_name(self, feature):
@@ -204,8 +204,7 @@ def reannotate_5_3_prime(self,genes):
 				if found_gene_name not in name_to_genes:
 					filtered_names_to_genes[found_gene_name] = Gene(self.embl_reader.genes_to_features[found_gene_name], [])
 					filtered_names_to_genes[found_gene_name].blocks = gene.blocks
-
-
+
 					regulation_category = self.regulation(filtered_names_to_genes[found_gene_name].feature.strand,prime_end, directions)
 					if regulation_category:
 						filtered_names_to_genes[found_gene_name].categories.append(regulation_category)
@@ -218,21 +217,12 @@ def reannotate_5_3_prime(self,genes):
 					if regulation_category:
 						filtered_names_to_genes[found_gene_name].categories.append(regulation_category)
 
-
 		# carry over non prime genes
 		for name, gene in name_to_genes.items():
 			res = re.search("^(.+)__[35]prime$", name)
 			if not res:
 				if name not in filtered_names_to_genes:
 					filtered_names_to_genes[name] = gene
-
-		# If theres decreased insertions
-		#for name, gene in filtered_names_to_genes.items():
-		#	increased = [True for g in gene.blocks if g.expression == 'increased_insertions']
-		#	for b in filtered_names_to_genes[name].blocks:
-		#		if b.max_logfc < 0 and True in increased:
-		#			b.max_logfc *= -1
-		#		elif b.max_logfc > 0 and True not in increased:
-		#			b.max_logfc *= -1
 
 		return [g for g in filtered_names_to_genes.values()]
+
diff --git a/albatradis/GeneReport.py b/albatradis/GeneReport.py
@@ -13,9 +13,26 @@ def read_all_data_csv(self):
 			reader = csv.reader(csvfile, delimiter='	')
 			all_data =  [r for r in reader if r[0] != 'Gene']
 		return all_data
+
+	def fix_sign_on_logfc(self,gene_all_data):
+		for r in gene_all_data:
+			r[4] = int(r[4])
+			if r[1] == 'upregulated':
+				if r[4] < 0:
+					r[4] *= -1
+			elif r[1] == 'downregulated':
+				if r[4] > 0:
+					r[4] *= -1
+			elif r[5] == 'increased_insertions':
+				if r[4] < 0:
+					r[4] *= -1
+			elif r[5] == 'decreased_insertions':
+				if r[4] > 0:
+					r[4] *= -1
+		return gene_all_data
 
 	def read_csv(self, gene_all_data):
-		return {r[0]: r[4] for r in gene_all_data}
+		return {r[0]: r[4] for r in self.fix_sign_on_logfc(gene_all_data)}
 
 	def genes_to_logfc(self, gene_names):
 		row = []

diff --git a/albatradis/HeatMap.py b/albatradis/HeatMap.py
@@ -34,7 +34,7 @@ def clean_filenames(self):
 	def create_heat_map(self):
 		int_values = [[int(y) for y in x] for x in self.reports_to_gene_logfc.values()]
 
-		dims = (16.54, 23.4)
+		dims = (8.3, 11.7)
 		fig, ax = plt.subplots(figsize = dims)
 		data = pd.DataFrame(int_values, columns = self.gene_names, index=self.clean_filenames())
 

diff --git a/albatradis/PlotLog.py b/albatradis/PlotLog.py
@@ -80,12 +80,14 @@ def blocks_create(self, logfc_values):
 	def genome_wide_logfc(self,logfc_coord_values):
 		logfc_to_bases = numpy.zeros(self.genome_length, dtype = int)
 
+		# start with the largest signals and overwrite with smaller signals.
+		# prevents issues with overlapping blocks/genes
+		sorted_logfc_coord_values = sorted(logfc_coord_values, key = lambda l: (numpy.absolute(l.logfc_value)))
 		for l in logfc_coord_values:
 			abs_logfc_value = numpy.absolute(l.logfc_value)
 
 			for i in range(l.start -1, l.end):
-				if logfc_to_bases[i] < abs_logfc_value:
-					logfc_to_bases[i] = l.logfc_value
+				logfc_to_bases[i] = l.logfc_value
 
 		return self.span_block_gaps(self.filter_out_small_blocks(logfc_to_bases))
 
@@ -126,6 +128,7 @@ def read_comparison_file(self):
 
 		genes_to_features = EMBLReader(self.embl_file).genes_to_features
 
+		genes_seen = {}
 		with open(self.comparison_filename, newline='') as csvfile:
 			comparison_reader = csv.reader(csvfile, delimiter=',')
 			for row in comparison_reader:
@@ -161,10 +164,18 @@ def read_comparison_file(self):
 				if gene_name in genes_to_features:
 					start = genes_to_features[gene_name].location.start +1
 					end = genes_to_features[gene_name].location.end
-
-					#print(gene_name+ "\t"+str(start)+ "\t"+str(end)+"\t", logfc)
 					logfc_coord_values.append(LogFC(int(start), int(end), logfc))
+					# A gene should only be identified once
+					genes_seen[gene_name] = 1
 				else:
 					print("Couldnt find gene coordinates for "+ str(gene_name))
+
+			# loop over all the remaining gene and give them a value of zero
+			for gene_name,feature in genes_to_features.items():
+				if gene_name not in genes_seen:
+					start = feature.location.start +1
+					end = feature.location.end
+					logfc_coord_values.append(LogFC(int(start), int(end), 0))
+
 		return logfc_coord_values