forked from stajichlab/coding_challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
featuresanalyzer.py
72 lines (49 loc) · 2.14 KB
/
featuresanalyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python3
# import modules
import pandas as pd
import numpy as np
import re
# Default input: the features file the analysis was written for.
DEFAULT_BED_PATH = '/Carnegie/DPB/Data/Shared/Labs/Dinneny/Private/Ysun/2020/challenge/features.bed'
# The file has no header row; these names are assigned to its four columns.
BED_COLUMNS = ["Chr", "Start", "End", "Features"]
# Label of the derived per-feature length column.
BASES_COLUMN = "# of bases"
# A feature is reported as a gene when its name starts with this text.
GENE_PREFIX = "Gene"


def load_features(path=DEFAULT_BED_PATH):
    """Read the tab-separated, header-less four-column file at *path*.

    Returns a DataFrame with the columns Chr, Start, End and Features.
    """
    return pd.read_csv(path, sep='\t', names=BED_COLUMNS)


def feature_types(features):
    """Return the type of every feature name: the text before the first '-'.

    Splitting only once keeps names that contain several hyphens (or none)
    from breaking the analysis; a name without '-' is its own type.
    """
    return features.str.split('-', n=1).str[0]


def is_gene(features):
    """Return a boolean mask marking the names that start with GENE_PREFIX."""
    # na=False: a missing name is simply "not a gene" instead of producing a
    # NaN that cannot be used for boolean indexing.
    return features.str.startswith(GENE_PREFIX, na=False)


def count_genes(features):
    """Return how many feature names are genes.

    The previous code printed the count of the single most frequent token
    obtained by splitting every name on '-', which is only the gene count
    when "Gene" happens to be the most common token.
    """
    return int(is_gene(features).sum())


def count_by_type(features):
    """Return a one-column frame with the number of features of each type.

    The index is named "Feature" and the count column "Featuretype", the
    same labels the script has always printed.
    """
    counts = feature_types(features).value_counts().sort_index()
    return counts.rename_axis("Feature").to_frame("Featuretype")


def feature_lengths(table):
    """Return |End - Start| for every row of *table*."""
    # abs() is kept from the original so reversed coordinates still give a
    # positive length.
    return (table["End"] - table["Start"]).abs()


def bases_per_type(table):
    """Return the total number of bases covered by each feature type.

    *table* needs the columns Start, End and Features. The result is a
    one-column frame indexed by type, sorted by type name.
    """
    totals = feature_lengths(table).groupby(feature_types(table["Features"])).sum()
    return totals.rename_axis("Feature").to_frame(BASES_COLUMN)


def genes_by_size(table):
    """Return the gene rows (name and length) ordered largest to smallest."""
    sizes = table[["Features"]].assign(**{BASES_COLUMN: feature_lengths(table)})
    genes = sizes[is_gene(sizes["Features"])]
    return genes.sort_values(by=BASES_COLUMN, ascending=False)


def main(path=DEFAULT_BED_PATH):
    """Load the features file at *path* and print the answer to each question."""
    content = load_features(path)
    print("This is what is in the file:")
    print(content)

    # Features are counted by distinct name. As before, every later answer is
    # computed on this de-duplicated table (first occurrence of a name wins);
    # the step is now explicit instead of mutating the loaded frame in place.
    unique = content.drop_duplicates(subset=["Features"])

    print("How many features are in the file?")
    print(len(unique))

    print("How many genes are in the file?")
    print(count_genes(unique["Features"]))

    print("What are the types of features?")
    # one type per line
    print(*feature_types(unique["Features"]).unique(), sep="\n")

    print("How many of each type of feature?")
    print(count_by_type(unique["Features"]))

    print("How many bases are contained in each type of feature?")
    # Summed per type; the previous output listed every feature individually.
    print("\n# of bases for each type of feature :\n", bases_per_type(unique))

    print("Print out the Genes in order of largest to smallest")
    print(genes_by_size(unique))


if __name__ == "__main__":
    import sys

    # Optional first argument overrides the default input file.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_BED_PATH)