compatibility to gene id without species id

zhangrengang · Dec 4, 2024 · b7161dd · b7161dd
1 parent 900f2e4
commit b7161dd
Show file tree

Hide file tree

Showing 6 changed files with 35 additions and 18 deletions.
diff --git a/SOI-tools.md b/SOI-tools.md
@@ -3,7 +3,7 @@
 Before run the pipeline:
 1. install the lasest verion of [SOI](https://github.com/zhangrengang/orthoindex#installation) by `python3 setup.py install`, 
 and have a check: `which soi-syn`;
-2. completed the [example pipeline](https://github.com/zhangrengang/evolution_example) to get the orthologous synteny file `collinearity.ortho`;
+2. complete the [example pipeline](https://github.com/zhangrengang/evolution_example) to get the orthologous synteny file `collinearity.ortho`;
 3. prepare file `species.config` to set the expected subgenome numbers for the targeted species (TAB seperated):
 ```
 Vitis_vinifera  1

diff --git a/soi/OrthoFinder.py b/soi/OrthoFinder.py
@@ -1142,15 +1142,17 @@ def get_homologs(self, sps=None, sp1=None, sp2=None, byName=True, **kargs):
 				Genes_1 = list(map(gene_format_o, Genes_1))
 				Genes_2 = list(map(gene_format_o, Genes_2))
 				for (sp1, g1), (sp2, g2) in itertools.product(Genes_1, Genes_2):
-					assert sp1 != sp2
+					if sp1 is not None and sp2 is not None:
+						assert sp1 != sp2
 					if (g2, g1) in ortho_pairs:
 						continue
 					ortho_pairs.add( (g1, g2) )	#orthologs
 				for Genes in [Genes_1, Genes_2]:
 					if len(Genes) < 2:
 						continue
 					for (sp1, g1), (sp2, g2) in itertools.combinations(Genes, 2):
-						assert sp1 == sp2
+						if sp1 is not None and sp2 is not None:
+							assert sp1 == sp2
 						if sps is not None and sp1 not in sps:
 							continue
 						if (g2, g1) in ortho_pairs:
@@ -1840,7 +1842,8 @@ def get_oglogs(OFdir, outHomo):
 		line = [g1, g2]
 		print('\t'.join(line), file=outHomo)
 def gene_format_o(gene):
-	sp, g = gene.split('|')
+	try: sp, g = gene.split('|', 1)
+	except ValueError: sp, g = None, gene
 	return (sp, gene)
 def get_paralogs(OFdir, outHomo, min_support=0.5):
 	result = OrthoFinder(OFdir)
@@ -1867,13 +1870,13 @@ def get_paralogs(OFdir, outHomo, min_support=0.5):
 				line = [g1, g2, info]
 				print('\t'.join(line), file=outHomo)
 def gene_format_p(gene):
-	sp, g = gene.split('|')
+	sp, g = gene.split('|', 1)
 	sp = sp[:int(len(sp)/2)]
 	g = sp + '|' + g
 	return (sp, g)
 def gene_format_common(gene):
 	if not '|' in gene:
-		sp, g = gene, gene
+		sp, g = None, gene
 		return (sp, g)
 	sp, g = gene.split('|')
 	sp1 = sp[:len(sp)/2]

diff --git a/soi/WGDI.py b/soi/WGDI.py
@@ -137,6 +137,9 @@ def plot_dotplot(self, ax=None, d_offset={}, gff=None, align='center', xy=1,  ax
 		has_lab = False
 		texts = []
 		for sgement in self.lazy_lines(gene_axis, gff=gff):
+		#	print(sgement, d_offset)
+			if sgement.chrom not in d_offset:
+				continue
 			offset = d_offset[sgement.chrom] + sgement.start
 			lab = sgement.label if label else None
 		#	print(xy, len(sgement), width, offset, sgement.color, align, lab, gene_axis, self.is_order(), sgement.start, sgement.end)
@@ -205,9 +208,11 @@ def __init__(self, line):
 		except IndexError: self.label = None
 	def __len__(self):
 		return self.end - self.start + 1
-	@property
+	@lazyproperty
 	def id(self):
 		return '{}:{}-{}|{}'.format(self.chrom, self.start, self.end, self.subgenome)
+	def __repr__(self):
+		return self.id
 	def write(self, fout):
 		print('\t'.join(map(str, (self.chrom, self.start, self.end, self.color, self.subgenome))), file=fout)
 def stats_besthits(alignment, pol_indice, dip_indice, labels, blasts):

diff --git a/soi/__version__.py b/soi/__version__.py
@@ -1 +1 @@
-version = '1.1'
+version = '1.1.2'
diff --git a/soi/dot_plotter.py b/soi/dot_plotter.py
@@ -75,6 +75,7 @@ def dotplot_args(parser):
 	group_ks.add_argument('--lower-ks', metavar='Ks', type=float, default=None, help="lower limit of median Ks. [default=%(default)s]")
 	group_ks.add_argument('--upper-ks', metavar='Ks', type=float, default=None, help="upper limit of median Ks. [default=%(default)s]")
 	group_ks.add_argument('--output-hist', action='store_true', default=False, help="output the data for histogram plot. [default=%(default)s]")
+	group_ks.add_argument('--cbar', action='store_true', default=False, help="plot color bar when no histogram plot. [default=%(default)s]")
 #	group_ks.add_argument('--clip-ks', action='store_true', default=None, help="clip ks > max-ks. [default=%(default)s]")
 #	group_ks.add_argument('--hist-ylim', type=float, default=None, help="max y axis of Ks histgram. [default=%(default)s]")
 #	group_ks.add_argument('--yn00', action='store_true', default=False, help='turn to YN00[default=%(default)s]')
@@ -178,7 +179,7 @@ def main(args):
 		plot_blocks(blocks, outplots, ks = None, max_ks=None, ks_hist=None,
 			xlabels=chrs1, ylabels=chrs2,
 			xpositions=xpositions, ypositions=ypositions,
-			xlines=lines1, ylines=lines2,
+			xelines=lines1, yelines=lines2,
 			xlim=max(lines1), ylim=max(lines2))
 	ploidy_data = coord_path1, coord_path2, coord_graph1, coord_graph2 = parse_gff(gff, chrs1, chrs2)
 	outplots = [prefix + '.' + fmt for fmt in args.format]
@@ -190,7 +191,7 @@ def main(args):
 			xlabels=chrs1, ylabels=chrs2, same_sp=same_sp,
 			#hist_ylim=args.hist_ylim,
 			xpositions=xpositions, ypositions=ypositions,
-			xelines=lines1, yelines=lines2,
+			xelines=lines1, yelines=lines2,	# chromosome ends
 			xclines=xclines, yclines=yclines, # such as centromere
 			xlim=max(lines1), ylim=max(lines2),
 			xoffset=d_offset1, yoffset=d_offset2, gff=gff, 
@@ -207,7 +208,7 @@ def plot_blocks(blocks, outplots, ks=None, max_ks=None, ks_hist=False, ks_cmap=N
 			xlabels=None, ylabels=None, xpositions=None, ypositions=None, xelines=None, yelines=None, xlim=None, ylim=None,
 			figsize=18, fontsize=10, point_size=0.8, xclines=None, yclines=None, plot_bin=None, output_hist=False,
 			xoffset=None, yoffset=None, xbars=None, ybars=None, gff=None, gene_axis=None, xbarlab=True, ybarlab=True, 
-			hist_ylim=None, xlabel=None, ylabel=None, remove_prefix=True, number_plots=True, same_sp=False,
+			hist_ylim=None, xlabel=None, ylabel=None, remove_prefix=True, number_plots=True, same_sp=False, cbar=False,
 			ploidy=False, ploidy_data=None, ortholog_graph=None, of_color=False, homology=False, **kargs
 			):
 	import matplotlib
@@ -366,11 +367,11 @@ def plot_blocks(blocks, outplots, ks=None, max_ks=None, ks_hist=False, ks_cmap=N
 		plot_label(ax, label, fontsize=lsize)
 
 	tlabel = 'OrthoIndex' if of_color else 'Ks'
-	if not ks is None and ks_hist is None and False:	# color map only
+	if not ks is None and ks_hist is None and cbar:	# color map only
 		ax = plt.subplot2grid((21,20),(20,0), colspan=5)
 		plt.axis('off')
-		cbar = plt.colorbar(ax=ax, orientation='horizontal', location='bottom', label=label, shrink=1, fraction=0.5)
-		# cbar.ax.set_xlabel('Ks', fontsize=14)
+		cbar = plt.colorbar(ax=ax, orientation='horizontal', location='bottom', label=tlabel, shrink=1, fraction=0.5)
+		#cbar.ax.set_xlabel(tlabel, fontsize=14)
 
 
 	# ax2

diff --git a/soi/mcscan.py b/soi/mcscan.py
@@ -35,7 +35,7 @@ def __init__(self, info):
 		self.coord = (chr, start, end)
 		try: self.species, self.raw_gene = id.split('|', 1)
 		except ValueError: 
-			logger.warn('Species id not found in `{}`, ignoring'.fomart(id))
+#			logger.warn('Species id not found in `{}`, ignoring'.format(id))
 			self.species, self.raw_gene = None, id
 	def __str__(self):
 		return self.id
@@ -1091,7 +1091,7 @@ def _parse(self):
 		self.Gene = g
 		self.id = gene
 		try: self.species, self.raw_gene = gene.split('|', 1)
-		except ValueError: pass
+		except ValueError: self.species, self.raw_gene = None, gene
 	def __hash__(self):
 		return hash(self.id)
 	def __str__(self):
@@ -2942,17 +2942,25 @@ def test():
 def list_blocks(collinearity, outTsv, gff=None, kaks=None):
 	'''以共线性块为单位，输出信息'''
 	line = ["Alignment", "species1", "species2", "chr1", "chr2", "start1", "end1", "length1", "start2", "end2", "length2", "strand", "N_gene", "mean_Ks", 'median_Ks', 'score', 'e_value']
-	line = ["id", "species1", "species2", "chr1", "chr2", "start1", "end1", "length1", "start2", "end2", "length2", "strand", "length", "ks_average", 'ks_median', 'score', 'e_value']
+	line = ["id", "species1", "species2", "chr1", "chr2", "strand", 
+				"start1", "end1", "istart1", "iend1", 
+				"start2", "end2", "istart2", "iend2",
+				"length1", "length2", "N_gene", "ks_average", 'ks_median', 'score', 'e_value']
 	print('\t'.join(line), file=outTsv)
 	for rc in Collinearity(collinearity,gff=gff,kaks=kaks):
 		sp1, sp2 = rc.species
 		chr1, chr2 = rc.chrs
 		Alignment, score, e_value, N, strand = rc.Alignment, rc.score, rc.e_value, rc.N, rc.strand
 		start1, end1, length1 = rc.start1, rc.end1, rc.length1
 		start2, end2, length2 = rc.start2, rc.end2, rc.length2
+		istart1, iend1 = rc.istart1, rc.iend1
+		istart2, iend2 = rc.istart2, rc.iend2
 		mean_ks = rc.mean_ks
 		median_ks = rc.median_ks
-		line = [Alignment, sp1, sp2, chr1, chr2, start1, end1, length1, start2, end2, length2, strand, N, mean_ks, median_ks, score, e_value]
+		line = [Alignment, sp1, sp2, chr1, chr2, strand, 
+					start1, end1, istart1, iend1, 
+					start2, end2, istart2, iend2, 
+					length1, length2, N, mean_ks, median_ks, score, e_value]
 		line = list(map(str, line))
 		print('\t'.join(line), file=outTsv)
 def gene_class(collinearity, inTsv, outTsv, byAlignment=True):