-
Notifications
You must be signed in to change notification settings - Fork 0
/
CQgff3_to_EMVgff3.py
84 lines (65 loc) · 2.69 KB
/
CQgff3_to_EMVgff3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import argparse
import pandas as pd
import re
# the warnings are switched off. These are raised at line 36 and 39 for
# how the columns of the df is renamed
import warnings
warnings.filterwarnings('ignore')
parser = argparse.ArgumentParser(
description="""
This converts a Coding_Quarry GFF3 file to a format compatible with Evidence modler.
It requires the following modules.
pandas, re, argparse.
""")
parser.add_argument('CQ_in_file', help="Name of CQ_gff3_file", type=str)
args = parser.parse_args()
CQ_file = args.CQ_in_file
# here starts the main script reading in the file using pandas and making
# a copy of it
def add_column_9(x):
"""Function to add a 9th column to assure assocation across different contigs.
Needs to be deleted at the end again."""
match = re.search('(?<=ID=)\w+;', x.replace('.', '_').replace('CDS:', '')
) # remeber that '.' are not part of the \w+ characters
gene_id = match.group(0)[:-1]
return gene_id
CQ_df = pd.read_csv(CQ_file, header=None, sep="\t")
EVM_df = CQ_df.iloc[:, :]
EVM_df[9] = EVM_df[8].apply(add_column_9)
gdf = EVM_df[EVM_df[2] == 'gene'] # add the exon part for EVM df
gdf[2] = 'fmRNA'
edf = EVM_df[EVM_df[2] == 'CDS']
edf[2] = 'exon'
EVM_df = EVM_df.append(gdf).append(edf) # make the EVM dataframe
# EVM_df.loc[:,7] ='.' #first move to make columne 7 a '.' instead of frame
EVM_df.loc[:, 1] = 'CodingQuarry_v2'
# now after we generated the overall EVM dataframe we need to bring the house in order and sort correctly
# before we iterate over the each row to adapt the format
# adding row #9 ensures that genes that belong together
EVM_df.sort_values(by=[0, 9, 3, 2], ascending=[0, 0, 1, 0], inplace=True)
# stay together
EVM_df.reset_index(drop=True, inplace=True)
name = 'Name=CodingQuarry_v2_prediction'
# go over the whole data frame and change the nameing in columne 8
# according to EVM syntax
for x, y in EVM_df.iterrows():
if y[2] == 'gene':
gene_id = ''
counter = 1
gene_id = y[0] + '-' + y[9]
EVM_df.iloc[x, 8] = 'ID=' + gene_id + ';' + name + '_' + gene_id
if y[2] == 'fmRNA':
EVM_df.iloc[x, 8] = 'ID=' + gene_id + '.t1;Parent=' + \
gene_id + ';' + name + '_' + gene_id
EVM_df.iloc[x, 2] = 'mRNA'
if y[2] == 'exon':
EVM_df.iloc[x, 8] = 'ID=' + gene_id + '.t1.exon' + \
str(counter) + ';Parent=' + gene_id + '.t1'
counter += 1
if y[2] == 'CDS':
EVM_df.iloc[x, 8] = 'ID=cds.' + gene_id + \
'.t1;Parent=' + gene_id + '.t1'
outname = CQ_file.split('.')[0] + '.EVM.gff3'
# now safe it and move on
EVM_df.iloc[:, 0:9].to_csv(outname, sep='\t', header=None, index=None)
exit()