-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathplanttfdb.py
73 lines (51 loc) · 1.85 KB
/
planttfdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python3
import helper
import requests
from bs4 import BeautifulSoup
import pandas as pd
import gzip
def _find_species_link(soup, species, base_url):
    """Return the absolute download URL for *species*, or None if not found.

    Scans every anchor inside the ``<table id="oid_tfid">`` element(s) of the
    parsed download page and returns the first one whose leading child equals
    *species* and that carries an ``href`` attribute.
    """
    for table in soup.find_all('table', {"id": "oid_tfid"}):
        for anchor in table.find_all('a'):
            # Skip empty anchors instead of failing on contents[0].
            if not anchor.contents or anchor.contents[0] != species:
                continue
            href = anchor.get('href')
            if href is not None:
                return base_url + href
    return None


def _load_tf_table(path):
    """Read the cached tab-delimited list at *path* into a DataFrame.

    The result always has the columns ``TF_ID``, ``Gene_ID`` and ``Family``;
    an empty file yields an empty DataFrame with those columns so that the
    lookups performed by the caller still work.
    """
    columns = ["TF_ID", "Gene_ID", "Family"]
    # Read-only access is enough; the file is never modified here.
    with open(path, "rb") as handle:
        try:
            table = pd.read_csv(handle, sep="\t", header=None)
        except pd.errors.EmptyDataError:
            return pd.DataFrame(columns=columns)
    # Named columns
    table.columns = columns
    return table


def planttfdb(MSUID):
    """Look up *MSUID* in the PlantTFDB list for Oryza sativa subsp. japonica.

    The download page is scraped for the species' list, which is fetched and
    decompressed into a local cache file on first use, then searched.

    Args:
        MSUID: identifier compared first against the ``TF_ID`` column and
            then against the ``Gene_ID`` column.

    Returns:
        The DataFrame of matching rows when *MSUID* matches a ``TF_ID``;
        otherwise a dict ``{"Family": <family>}`` built from the first row
        whose ``Gene_ID`` matches; otherwise ``False``.

    Raises:
        RuntimeError: if the species link is not present on the download page.
        requests.HTTPError: if the download answers with an HTTP error status.
    """
    base_url = 'http://planttfdb.cbi.pku.edu.cn/'
    species = "Oryza sativa subsp. japonica"

    # Find the file
    # NOTE(review): helper.connectionError is defined outside this block;
    # only the `.content` attribute of its result is relied upon here.
    html_page = helper.connectionError(base_url + 'download.php')
    soup = BeautifulSoup(html_page.content, "lxml")
    link = _find_species_link(soup, species, base_url)
    if link is None:
        raise RuntimeError(
            "PlantTFDB download link for '" + species + "' was not found"
        )

    # Give the entire name of the file with the extension .gz
    filename = link.split("/")[-1]
    # Give the name of the file without .gz
    # (assumes the link ends in a three-character ".gz" suffix)
    uncompressName = filename[:-3] + ".txt"
    pathToFile = helper.formatPathToFile(uncompressName)

    # Download only when the cache file does not exist yet
    if not helper.existFile(pathToFile):
        # Progress message (French), kept verbatim.
        print("on telecharege")
        # Fetch the file by the url and decompress it
        r = requests.get(link)
        # Fail on HTTP errors rather than feeding an error page to gzip.
        r.raise_for_status()
        decompressedFile = gzip.decompress(r.content)
        # Create the file .txt; decompression happens first so a corrupt
        # download never leaves a cache file behind.
        with open(pathToFile, "wb") as f:
            f.write(decompressedFile)

    # Use the cached file (.txt)
    array = _load_tf_table(pathToFile)

    data = array.loc[array['TF_ID'] == MSUID]
    if not data.empty:
        return data

    data = array.loc[array['Gene_ID'] == MSUID]
    if data.empty:
        return False
    return {"Family": data["Family"].values[0]}