-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_gff.py
39 lines (34 loc) · 2.1 KB
/
read_gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
import urllib
import urllib.parse
from collections import namedtuple
from glob import glob

import pandas as pd
# Default location of the per-sample result folders (the path the original
# script was hard-wired to).
_DEFAULT_RESULTS_DIR = r"C:\Users\nils\PycharmProjects\lab_project\results"

# Number of tab-separated columns a GFF3 feature line must have:
# seqid, source, type, start, end, score, strand, phase, attributes.
_GFF_COLUMN_COUNT = 9

# Converter applied to each exported column when its value is not the
# "missing" placeholder ".".  The ninth column (attributes) is deliberately
# absent: it is not exported.  Dict order defines the output column order.
_GFF_CONVERTERS = {
    "seqid": urllib.parse.unquote,
    "source": urllib.parse.unquote,
    "type": urllib.parse.unquote,
    "start": int,
    "end": int,
    "score": float,
    "strand": urllib.parse.unquote,
    "phase": urllib.parse.unquote,
}


def _parse_gff_line(line):
    """Return the normalized fields of one GFF3 feature line, or ``None``.

    ``None`` is returned for lines that carry no feature: comment/directive
    lines starting with ``#`` and lines with fewer than three tab-separated
    fields (blank lines, FASTA sequence data, ...).

    Raises:
        ValueError: if the line has three or more fields but not exactly
            nine, or if ``start``/``end``/``score`` is not numeric.
    """
    stripped = line.strip()
    if stripped.startswith("#"):
        return None

    parts = stripped.split("\t")
    if len(parts) <= 2:
        return None
    if len(parts) != _GFF_COLUMN_COUNT:
        # Raised explicitly (not asserted) so the check survives ``python -O``.
        raise ValueError(
            f"expected {_GFF_COLUMN_COUNT} tab-separated columns, "
            f"found {len(parts)}"
        )

    # zip() stops after the eight exported columns, skipping "attributes".
    return {
        name: None if raw == "." else convert(raw)
        for (name, convert), raw in zip(_GFF_CONVERTERS.items(), parts)
    }


def read_gff(output_name, results_dir=_DEFAULT_RESULTS_DIR):
    """Collect GFF3 feature records from result folders into one Excel file.

    Each entry directly inside ``results_dir`` is treated as a folder and
    searched (one level deep) for files whose path ends with ``output.gff3``
    but not with ``region_output.gff3``.  Every feature line of those files
    becomes one row with the columns seqid, source, type, start, end, score,
    strand and phase; a ``.`` in the file becomes an empty cell and
    URL-escaped text is unescaped.  The attributes column is not exported.

    Args:
        output_name: Path of the Excel file to write (passed to
            ``DataFrame.to_excel``).
        results_dir: Directory holding the per-sample result folders.
            Defaults to the location the script originally hard-coded.

    Raises:
        ValueError: if a feature line is malformed; the message names the
            offending file and line number.
    """
    records = []
    for folder in glob(os.path.join(results_dir, "*")):
        for path in glob(os.path.join(folder, "*")):
            if not path.endswith("output.gff3") or path.endswith("region_output.gff3"):
                continue
            with open(path) as infile:
                for line_number, line in enumerate(infile, start=1):
                    try:
                        record = _parse_gff_line(line)
                    except ValueError as err:
                        raise ValueError(f"{path}, line {line_number}: {err}") from err
                    if record is not None:
                        records.append(record)

    # Build the frame once: concatenating row by row is quadratic and gave
    # every row the same index label (0).
    table = pd.DataFrame(records, columns=list(_GFF_CONVERTERS))
    table.to_excel(output_name)