-
Notifications
You must be signed in to change notification settings - Fork 13
/
niid_upload.py
189 lines (169 loc) · 8.5 KB
/
niid_upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import os, re, time, datetime, csv, sys, json
from upload import upload
from rethinkdb import r
from Bio import SeqIO
import argparse
import subprocess
import unicodedata
from parse import parse
import xlrd
from upload import parser
sys.path.append('') # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload
from titer_block import find_titer_block, find_serum_rows, find_virus_columns
# Extra command-line option on top of the parser imported from upload; its
# value is handed to read_niid and ends up in the assay_type column of the tsv.
parser.add_argument('--assay_type', default='hi', help="assay type written to the assay_type column of the output tsv (default: %(default)s)")
def read_niid(path, fstem, subtype, assay_type):
    '''
    Convert xls tables to csv tables, then parse to flat tsv

    Looks for path + '/' + fstem with the extensions .xls, .xlsm and .xlsx and
    passes the workbook that exists to convert_niid_xls_to_tsv. When more than
    one of them exists, the last extension in that list is used.

    path: directory that holds the workbook
    fstem: workbook file name without its extension
    subtype, assay_type: forwarded unchanged to convert_niid_xls_to_tsv

    Returns None. If no matching workbook exists, a message is printed and
    nothing is converted.
    '''
    real_file = ''
    ind = ''
    for ext in ['.xls', '.xlsm', '.xlsx']:
        possible_file = path + '/' + fstem + ext
        if os.path.isfile(possible_file):
            # Deliberately no break: a later extension overrides an earlier one
            real_file = possible_file
            ind = ext

    if real_file == '':
        # Report the miss instead of returning silently, so the caller can see
        # why no tsv was produced
        print("No .xls, .xlsm or .xlsx file found for '{}' in '{}'; nothing converted.".format(fstem, path))
        return

    print("real_file: " + real_file)
    convert_niid_xls_to_tsv(path, fstem, ind, subtype, assay_type)
def convert_niid_xls_to_tsv(path, fstem, ind, subtype, assay_type):
    '''
    Parse the titer table of an NIID Excel workbook into a flat tsv file.

    Opens the workbook at path + '/' + fstem + ind with xlrd. For each
    worksheet, the titer block, the virus columns and the serum ID row are
    located with find_titer_block, find_virus_columns and find_serum_rows, and
    one line per titer cell is written to data/tmp/<fstem>.tsv.

    path: directory that holds the workbook
    fstem: workbook file name without extension; also names the output tsv
    ind: workbook file extension, including the leading dot
    subtype: compared case-insensitively; "h3n2" and "h1n1pdm" give serum
        strains the prefix "A/", "vic" and "yam" give "B/". For any other
        value the type letter is empty and the prefix is a bare "/".
    assay_type: written unchanged to the assay_type column of every row

    Returns None.
    '''
    # Set flutype
    # Lower-case the subtype so the comparisons below are case-insensitive
    subtype = subtype.lower()
    flutype = ""
    if subtype in ("h3n2", "h1n1pdm"):
        flutype = "A"
    if subtype in ("vic", "yam"):
        flutype = "B"

    # Set NIID patterns
    virus_pattern = r"[A-Z]/[\w\s-]+/.+/\d{4}"
    virus_passage_pattern = r"(MDCK|SIAT|E\d+|hCK)"
    serum_id_pattern = r".+(No\.|no\.).+"
    serum_passage_pattern = r".+(Egg|Cell).+"
    serum_abbrev_pattern = r"\w+\s{0,1}\w+/\d+.*"
    crick = False

    # The source column is the same for every row, so build it once
    source = "niid_%s"%(fstem).strip()

    # Open workbook
    wb_name = path + '/' + fstem + ind
    workbook = xlrd.open_workbook(filename=wb_name, encoding_override="cp1252")

    for worksheet_index, worksheet in enumerate(workbook.sheets(), start=1):
        print(f"Reading worksheet {worksheet_index} '{worksheet.name}' in file '{fstem}'")

        # autodetecting titer, virus, serum blocks
        titer_block = find_titer_block(worksheet)

        if len(titer_block["col_start"]) == 0:
            print("No titer block found.")
            # NOTE(review): break stops at the first worksheet without a titer
            # block, so later worksheets are never read - confirm this is intended.
            break

        # Use the first candidate of each boundary list; the print statements
        # below label these entries as the most likely ones
        titer_coords = {
            'col_start': titer_block["col_start"][0][0],
            'col_end': titer_block["col_end"][0][0],
            'row_start': titer_block["row_start"][0][0],
            'row_end': titer_block["row_end"][0][0]
        }

        virus_block = find_virus_columns(
            worksheet=worksheet,
            titer_coords=titer_coords,
            virus_pattern=virus_pattern,
            virus_passage_pattern=virus_passage_pattern,
        )

        # If no virus names are found, might not be a valid worksheet; stop here to avoid breaking find_serum_rows
        if virus_block["virus_names"] is None:
            print(f"Virus names not found. Check the virus pattern: '{virus_pattern}'")
            # NOTE(review): as above, break also abandons all remaining worksheets.
            break

        serum_block = find_serum_rows(
            worksheet=worksheet,
            titer_coords=titer_coords,
            virus_names=virus_block["virus_names"],
            serum_id_pattern=serum_id_pattern,
            serum_passage_pattern=serum_passage_pattern,
            serum_abbrev_pattern=serum_abbrev_pattern,
            crick=crick,
        )

        # Print the most likely row and column indices for the titer block
        print(f"Titer block: n = {titer_block['row_start'][0][1]}x{titer_block['col_start'][0][1]} = {titer_block['row_start'][0][1]*titer_block['col_start'][0][1]}")
        print(f" Most likely (n={titer_block['col_start'][0][1]}) col_start: {titer_block['col_start'][0][0]}")
        print(f" Most likely (n={titer_block['col_end'][0][1]}) col_end: {titer_block['col_end'][0][0]}")
        print(f" Most likely (n={titer_block['row_start'][0][1]}) row_start: {titer_block['row_start'][0][0]}")
        print(f" Most likely (n={titer_block['row_end'][0][1]}) row_end: {titer_block['row_end'][0][0]}")

        # For debugging purposes, print alternative indices (e.g. col_start, col_end, row_start, row_end)
        # print("Alternative indices:")
        # for i in range(1, len(titer_block['row_start'])):
        # print(f" Alternative (n={titer_block['row_start'][i][1]}) row_start: {titer_block['row_start'][i][0]}")

        # Print Virus and Serum annotations row and column indices
        print("Virus (antigen) block: left and right of the titer block")
        print(f" virus column index: {virus_block['virus_col_idx']}")
        print(f" virus passage column index: {virus_block['virus_passage_col_idx']}")
        print(f" virus names: {virus_block['virus_names']}")

        print("Serum (antisera) block: above the titer block")
        print(f" serum ID row index: {serum_block['serum_id_row_idx']}")
        print("Serum strain and serum passage will be parsed from serum ID row")

        # Make sure the hard-coded output directory exists before writing to it
        os.makedirs('data/tmp', exist_ok=True)

        # NOTE(review): the file is opened in 'w' mode for every worksheet, so a
        # later valid worksheet overwrites the rows of an earlier one.
        with open('data/tmp/%s.tsv'%(fstem), 'w') as outfile:
            header = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
            outfile.write("%s\n" % ("\t".join(header)))

            serum_id_row_index = serum_block['serum_id_row_idx']
            row_start = titer_coords['row_start']
            row_end = titer_coords['row_end']
            virus_id_col_index = virus_block['virus_col_idx']
            virus_passage_col_index = virus_block['virus_passage_col_idx']
            col_start = titer_coords['col_start']
            col_end = titer_coords['col_end']

            # Both bounds of the titer block are inclusive
            for i in range(row_start, row_end+1):
                for j in range(col_start, col_end+1):
                    virus_strain = str(worksheet.cell_value(i,virus_id_col_index)).strip()

                    # Serum ID with all spaces and line breaks removed
                    serum_id = str(worksheet.cell_value(serum_id_row_index,j)).strip().replace(' ','')
                    serum_id = re.sub(r'[\r\n ]+', '', serum_id)

                    # The serum strain is whatever precedes the first passage /
                    # reassortant keyword, or failing that, precedes "no."
                    m = re.search(r'^(\S+)(egg|cell|siat|hck|nib121|ivr|\(bvr)', serum_id, re.IGNORECASE)
                    if m is None:
                        m = re.search(r'^(\S+)(no\.)', serum_id, re.IGNORECASE)
                    serum_strain = ""
                    if m:
                        serum_strain = m.group(1)
                        if not serum_strain.startswith(flutype + "/"):
                            serum_strain = flutype + "/" + serum_strain

                    # Normalize fullwidth U+FF1C to ASCII U+003C '<'
                    titer = unicodedata.normalize('NFKC', str(worksheet.cell_value(i,j)).strip())
                    # Allow either "< 10" or "<10"
                    titer = re.sub(r'< ', '<', titer)

                    virus_passage = str(worksheet.cell_value(i,virus_passage_col_index)).strip()
                    virus_passage_category = ''

                    # A cell/siat/hck match takes precedence over an egg match
                    serum_passage = "unknown"
                    m = re.search(r'(egg)', serum_id, re.IGNORECASE)
                    if m:
                        serum_passage = m.group(1)
                    m = re.search(r'(cell|siat|hck)', serum_id, re.IGNORECASE)
                    if m:
                        serum_passage = m.group(1)
                    serum_passage_category = ''

                    line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
                    outfile.write(line)
def determine_subtype(original_path):
    '''
    Derive the subtype from the components of a '/'-separated path.

    The path is lower-cased and split on '/'; the first entry of the table
    below whose folder name equals one of the components decides the result.
    Returns "UnknownSubtype" when no component matches.
    '''
    components = set(original_path.lower().split('/'))
    # (folder name, subtype) pairs in order of precedence
    folder_to_subtype = (
        ('h3n2', 'h3n2'),
        ('h1n1pdm', 'h1n1pdm'),
        ('victoria', 'vic'),
        ('yamagata', 'yam'),
    )
    for folder, subtype in folder_to_subtype:
        if folder in components:
            return subtype
    return "UnknownSubtype"
if __name__=="__main__":
    # Script entry point: convert the NIID workbook to data/tmp/<fstem>.tsv,
    # then run tdb/elife_upload.py on that tsv.
    import shlex  # only needed to display the command below

    args = parser.parse_args()
    if args.path is None:
        args.path = "data/"
    if args.database is None:
        args.database = "niid_tdb"
    if not os.path.isdir(args.path):
        os.makedirs(args.path)
    subtype = determine_subtype(args.path)
    read_niid(args.path, args.fstem, subtype, args.assay_type)

    # Pass the arguments as a list and skip the shell: parentheses, spaces or
    # other shell metacharacters in fstem or database then need no escaping and
    # cannot be interpreted as shell syntax.
    command = [
        "python", "tdb/elife_upload.py",
        "-db", args.database,
        "--subtype", subtype,
        "--path", "data/tmp/",
        "--fstem", args.fstem,
    ]
    if args.preview:
        command.append("--preview")
    # Show a shell-quoted rendering of the command that is about to run
    print(" ".join(shlex.quote(part) for part in command))
    subprocess.call(command)