-
Notifications
You must be signed in to change notification settings - Fork 9
/
100_intersect_columns.py
executable file
·112 lines (96 loc) · 2.83 KB
/
100_intersect_columns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# 100_intersect_columns.py - /Users/vikas0633/Desktop/script/python/ - script to print the entries shared between the key columns of two files
import os,sys,getopt, re
### Usage: python ~/Desktop/script/python/100_intersect_columns.py -i exp_val_in_db.txt -j 20121227_conserved_proteins_mRNA_headers.txt -c 4 -d 3 -s '|'
### parse the command-line arguments
def options(argv):
    """Parse the command-line arguments of the script.

    Recognised options:
        -i / --file1        path of the first file
        -j / --file2        path of the second file
        -c / --col1         1-based key column(s) of the first file, comma separated
        -d / --col2         1-based key column(s) of the second file, comma separated
        -s / --separatedBy  field separator (defaults to a tab)
        -f / --first_line   flag: treat the first data line of each file as a header
        -h / --help         print the usage text and exit

    Args:
        argv: argument list without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        Tuple ``(file1, file2, col1, col2, sep, first_line)``.  The column
        specifications are returned as the raw strings given on the command
        line; ``first_line`` is a bool.

    Raises:
        SystemExit: with status 2 after printing the usage text, either on an
            unknown/malformed option or when help is requested.
    """
    # Side effect kept from the original: the module-level ``header`` is reset.
    global header
    file1 = ''
    file2 = ''
    col1 = ''
    col2 = ''
    sep = '\t'
    first_line = False
    header = ''

    usage_on_error = '''
python 100_intersect_columns.py
-i <file1>
-j <file2>
-c <col1> # multiple columns separated by commas
-d <col2> # multiple columns separated by commas
-s <separatedBy>
-f <first_line>
-h <help>
'''
    usage_on_help = '''
python 100_intersect_columns.py
-i <file1>
-j <file2>
-c <col1> # multiple columns separated by commas
-d <col2> # multiple columns separated by commas
-f <first_line> # Use this option to skip fist_line/header
-s <separatedBy>
-h <help>
'''

    # "first_line" and "help" take no value, hence no trailing "=".  They were
    # handled below but never registered, so the long forms used to be
    # rejected as unknown options.
    long_options = ["file1=", "file2=", "col1=", "col2=", "separatedBy=",
                    "first_line", "help"]
    try:
        opts, args = getopt.getopt(argv, "hi:j:c:d:s:f", long_options)
    except getopt.GetoptError:
        print(usage_on_error)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage_on_help)
            sys.exit(2)
        elif opt in ("-i", "--file1"):
            file1 = arg
        elif opt in ("-j", "--file2"):
            file2 = arg
        elif opt in ("-c", "--col1"):
            col1 = arg
        elif opt in ("-d", "--col2"):
            col2 = arg
        elif opt in ("-s", "--separatedBy"):
            sep = arg
        elif opt in ("-f", "--first_line"):
            first_line = True
    return file1, file2, col1, col2, sep, first_line
### hash the first file
def HASH(file1,c1,sep,first_line):
    """Index the lines of the first file by their key columns.

    Lines starting with '#' and blank lines are skipped.  When ``first_line``
    is true, the first remaining line is taken as the header and is not
    indexed.  The key of a line is the concatenation of ``'-' + field`` for
    every requested column, e.g. columns "1,3" of "a<sep>b<sep>c" give "-a-c".
    If several lines share a key, the last one wins.

    Args:
        file1: path of the file to index.
        c1: comma-separated, 1-based column numbers forming the key.
        sep: field separator.
        first_line: True if the first data line is a header.

    Returns:
        Tuple ``(table, header)``: a dict mapping key -> stripped line, and
        the stripped header line ('' when no header was read).

    Raises:
        ValueError: if ``c1`` contains something that is not an integer.
        IndexError: if a line has fewer fields than a requested column.
    """
    # Use the c1 argument (the original read a module-level ``col1``) and
    # convert the column numbers once instead of once per line.
    indexes = [int(column) - 1 for column in c1.split(',')]
    header = ''
    table = {}
    with open(file1, 'r') as handle:
        for raw in handle:
            # The comment test is made on the raw line, as before: an
            # indented '#' does not count as a comment.
            if raw.startswith('#'):
                continue
            line = raw.strip()
            if not line:
                # Blank lines carry no fields; skipping them avoids bogus
                # keys and IndexError on multi-column keys.
                continue
            if first_line:
                header = line
                first_line = False
                continue
            token = line.split(sep)
            key = ''.join(['-' + token[index] for index in indexes])
            table[key] = line
    return table, header
### parse the second file
def PARSE(file2,c2,sep,first_line,hash, header):
    """Print the lines of the second file whose key is present in ``hash``.

    Lines starting with '#' and blank lines are skipped.  The key of a line is
    built like in HASH: ``'-' + field`` for every requested column.  For each
    line whose key is in ``hash``, the stripped line, a tab and the stored
    line from the first file are printed.  When ``first_line`` is true, the
    first remaining line is treated as a header and printed followed by a tab
    and ``header``.

    Args:
        file2: path of the file to scan.
        c2: comma-separated, 1-based column numbers forming the key.
        sep: field separator.
        first_line: True if the first data line is a header.
        hash: dict mapping key -> line, as returned by HASH.
        header: header line of the first file ('' if none).

    Returns:
        None; the result is written to standard output.

    Raises:
        ValueError: if ``c2`` contains something that is not an integer.
        IndexError: if a line has fewer fields than a requested column.
    """
    # Use the c2 argument (the original read a module-level ``col2``) and
    # convert the column numbers once instead of once per line.
    indexes = [int(column) - 1 for column in c2.split(',')]
    with open(file2, 'r') as handle:
        for raw in handle:
            # The comment test is made on the raw line, as before: an
            # indented '#' does not count as a comment.
            if raw.startswith('#'):
                continue
            line = raw.strip()
            if not line:
                # Covers whitespace-only lines too, which used to reach the
                # field lookup.
                continue
            if first_line:
                print(line + '\t' + header)
                first_line = False
                continue
            token = line.split(sep)
            key = ''.join(['-' + token[index] for index in indexes])
            if key in hash:
                print(line + '\t' + hash[key])
if __name__ == "__main__":
    # Read the command line; the unpacked names are bound at module level.
    settings = options(sys.argv[1:])
    file1, file2, col1, col2, sep, first_line = settings

    # Index the first file by its key column(s).
    lookup, header = HASH(file1, col1, sep, first_line)

    # Scan the second file and print every line whose key was indexed.
    PARSE(file2, col2, sep, first_line, lookup, header)