-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDB.py
executable file
·251 lines (231 loc) · 8.43 KB
/
DB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/env python
import pymongo
from pprint import pprint
import re
import os
import sys
import math
from datetime import datetime
import ruamel.yaml
import traceback
import atexit
from random import shuffle
# Module-level ruamel.yaml handle, created with the "safe" type.
yaml = ruamel.yaml.YAML(typ="safe")
# Turn flow style off by default for this handle.
yaml.default_flow_style = False
def date_now():
    """
    Return the current naive UTC time truncated to whole milliseconds.

    Mongo rounds datetimes to the millisecond, so the sub-millisecond part
    is dropped here to keep the Python value and the stored value the same.
    """
    current = datetime.utcnow()
    whole_milliseconds = current.microsecond // 1000
    return current.replace(microsecond=whole_milliseconds * 1000)
# Process-wide cache for the MongoDB client; None until a client is created.
CONNECTION = None


def close_connection():
    """Close the cached MongoDB client, if there is one, and clear the cache.

    Safe to call repeatedly: when no client is cached this is a no-op.
    """
    global CONNECTION
    if CONNECTION is not None:
        CONNECTION.close()
        # Drop the reference so the closed client is never handed out again;
        # the next request for a connection has to build a fresh one.
        CONNECTION = None


# Make sure the client is released when the interpreter exits.
atexit.register(close_connection)
def create_cgMLST_schema(name, loci):
    """Store a new cgMLST schema document and return its ``_id``.

    The document holds the schema ``name``, its ``loci`` and the number of
    loci under ``length``.
    """
    stored_schema = dump_cgMLST_schema_info(
        {'name': name, 'loci': loci, 'length': len(loci)}
    )
    return stored_schema['_id']
def create_cgMLST_index(schema_id, capacity):
    """Create an empty cgMLST index document for a schema and return its ``_id``.

    Parameters:
        schema_id: ``_id`` of an existing cgMLST schema document.
        capacity: stored as-is under the ``capacity`` key.

    Raises:
        IndexError: if no schema with ``schema_id`` exists.
    """
    schemas = get_cgMLST_schema([schema_id])
    if not schemas:
        raise IndexError("No cgMLST schema with _id {!r}".format(schema_id))
    schema_length = schemas[0]['length']

    # shuffle() works in place and returns None, and a range object cannot be
    # shuffled, so materialise the positions as a list first.
    order = list(range(schema_length))
    shuffle(order)

    data_dict = {
        'schema_id': schema_id,
        # dump_cgmlst_index_info() looks the document up by this nested key
        # when no _id is present, so it has to be part of the document.
        'cgmlst_schema': {'_id': schema_id},
        'order': order,  # order in which loci are used in the binary tree
        'capacity': capacity,
        'index': []  # binary tree (allele_number % 2)
    }
    data_dict = dump_cgmlst_index_info(data_dict)
    return data_dict['_id']
def get_connection():
    """Return the shared MongoDB client, creating it on first use.

    The environment variable ``BIFROST_DB_KEY`` must hold the path of a file
    whose first line is the MongoDB connection URL. The client is cached in
    the module-level ``CONNECTION`` and reused by later calls.

    Exits the process with status 1 when ``BIFROST_DB_KEY`` is unset or empty.
    """
    global CONNECTION
    if CONNECTION is not None:
        return CONNECTION

    mongo_db_key_location = os.getenv("BIFROST_DB_KEY", None)
    print("location: '{}'".format(mongo_db_key_location))
    # os.getenv returns None for an unset variable, so test for "falsy"
    # rather than only the empty string; otherwise open(None) would raise.
    if not mongo_db_key_location:
        print("Missing mongo db key", file=sys.stderr)
        sys.exit(1)

    with open(mongo_db_key_location, "r") as mongo_db_key_location_handle:
        mongodb_url = mongo_db_key_location_handle.readline().strip()

    CONNECTION = pymongo.MongoClient(mongodb_url)
    return CONNECTION
def get_cgMLST_schema(schema_ids=None):
    """
    Return cgMLST schema documents as a list, sorted by ``_id`` descending.

    With ``schema_ids`` given, only documents whose ``_id`` is in that list
    are returned; with ``None`` every schema document is returned.
    """
    if schema_ids is None:
        mongo_filter = {}
    else:
        mongo_filter = {"$and": [{"_id": {"$in": schema_ids}}]}

    database = get_connection().get_database()
    cursor = database.cgmlst_schemas.find(mongo_filter)
    return list(cursor.sort([("_id", pymongo.DESCENDING)]))
def dump_cgMLST_schema_info(data_dict):
    """Write a cgMLST schema dict to the ``cgmlst_schemas`` collection.

    ``metadata.updated_at`` is always stamped with the current time. A dict
    without ``_id`` is inserted (also stamping ``metadata.created_at``) and
    handed back with its new ``_id``; a dict with ``_id`` is upserted and the
    resulting stored document is returned.
    """
    schema_collection = get_connection().get_database().cgmlst_schemas
    timestamp = date_now()

    data_dict.setdefault("metadata", {})
    data_dict["metadata"]["updated_at"] = timestamp

    if "_id" not in data_dict:
        data_dict["metadata"]["created_at"] = timestamp
        insert_result = schema_collection.insert_one(data_dict)
        data_dict["_id"] = insert_result.inserted_id
        return data_dict

    return schema_collection.find_one_and_update(
        filter={"_id": data_dict["_id"]},
        update={"$set": data_dict},
        # hand back the document as it looks after the update
        return_document=pymongo.ReturnDocument.AFTER,
        # create the document when it does not exist yet
        upsert=True
    )
def delete_cgMLST_schema(component_id):
    """Delete the schema document with ``_id == component_id``.

    Returns the number of documents deleted.
    """
    schema_collection = get_connection().get_database().cgmlst_schemas
    outcome = schema_collection.delete_one({"_id": component_id})
    return outcome.deleted_count
def dump_cgmlst_allele_info(data_dict):
    """Upsert a cgMLST allele dict into the ``cgmlst_alleles`` collection.

    The document is matched on ``_id`` when the dict has one, otherwise on
    the pair ``sample._id`` / ``cgmlst_schema._id``. ``metadata.updated_at``
    is always refreshed; ``metadata.created_at`` is set only when the dict
    arrives without a ``metadata`` entry. Returns the stored document.
    """
    allele_collection = get_connection().get_database().cgmlst_alleles
    timestamp = date_now()

    data_dict.setdefault("metadata", {'created_at': timestamp})
    data_dict["metadata"]["updated_at"] = timestamp

    if "_id" in data_dict:
        lookup = {"_id": data_dict["_id"]}
    else:
        lookup = {
            "sample._id": data_dict["sample"]["_id"],
            "cgmlst_schema._id": data_dict["cgmlst_schema"]["_id"],
        }

    # Import relies on upsert being enabled: a missing document is created.
    return allele_collection.find_one_and_update(
        filter=lookup,
        update={"$set": data_dict},
        return_document=pymongo.ReturnDocument.AFTER,
        upsert=True
    )
def get_cgMLST_alleles(allele_ids=None):
    """
    Return cgMLST allele documents as a list, sorted by ``_id`` descending.

    With ``allele_ids`` given, only documents whose ``_id`` is in that list
    are returned; with ``None`` every allele document is returned.
    """
    if allele_ids is None:
        mongo_filter = {}
    else:
        mongo_filter = {"$and": [{"_id": {"$in": allele_ids}}]}

    database = get_connection().get_database()
    cursor = database.cgmlst_alleles.find(mongo_filter)
    return list(cursor.sort([("_id", pymongo.DESCENDING)]))
def delete_cgMLST_alleles(component_id):
    """Delete the allele document with ``_id == component_id``.

    Returns the number of documents deleted.
    """
    allele_collection = get_connection().get_database().cgmlst_alleles
    outcome = allele_collection.delete_one({"_id": component_id})
    return outcome.deleted_count
def dump_cgmlst_index_info(data_dict):
    """Upsert a cgMLST index dict into the ``cgmlst_index`` collection.

    The document is matched on ``_id`` when the dict has one, otherwise on
    ``cgmlst_schema._id``. ``metadata.updated_at`` is always refreshed;
    ``metadata.created_at`` is set only when the dict arrives without a
    ``metadata`` entry. Returns the stored document.
    """
    index_collection = get_connection().get_database().cgmlst_index
    timestamp = date_now()

    data_dict.setdefault("metadata", {'created_at': timestamp})
    data_dict["metadata"]["updated_at"] = timestamp

    if "_id" in data_dict:
        lookup = {"_id": data_dict["_id"]}
    else:
        lookup = {
            "cgmlst_schema._id": data_dict["cgmlst_schema"]["_id"],
        }

    # Import relies on upsert being enabled: a missing document is created.
    return index_collection.find_one_and_update(
        filter=lookup,
        update={"$set": data_dict},
        return_document=pymongo.ReturnDocument.AFTER,
        upsert=True
    )
def get_cgMLST_index(index_ids=None):
    """
    Return cgMLST index documents as a list, sorted by ``_id`` descending.

    With ``index_ids`` given, only documents whose ``_id`` is in that list
    are returned; with ``None`` every index document is returned.
    """
    if index_ids is None:
        mongo_filter = {}
    else:
        mongo_filter = {"$and": [{"_id": {"$in": index_ids}}]}

    database = get_connection().get_database()
    cursor = database.cgmlst_index.find(mongo_filter)
    return list(cursor.sort([("_id", pymongo.DESCENDING)]))
def delete_cgMLST_index(index_id):
    """Delete the index document with ``_id == index_id``.

    Returns the number of documents deleted.
    """
    index_collection = get_connection().get_database().cgmlst_index
    outcome = index_collection.delete_one({"_id": index_id})
    return outcome.deleted_count
if __name__ == "__main__":
    # Manual smoke test against the configured database: create a schema,
    # an allele record and an index, read them back, then remove all three.
    get_connection()

    # Insert a schema, then update the same document through its _id.
    data_dict = dump_cgMLST_schema_info({"test": "test"})
    pprint(data_dict)
    data_dict.update({"extra": 10})
    data_dict = dump_cgMLST_schema_info(data_dict)
    retrieved_schema = get_cgMLST_schema([data_dict["_id"]])
    pprint(retrieved_schema)
    schema_id = retrieved_schema[0]['_id']

    # Allele record tied to the schema created above.
    cgMLST_alleles = {
        'sample': {'_id': 12345},
        'cgmlst_schema': {'_id': schema_id},
        'alleles': [12, 235, 12]}
    data_dict = dump_cgmlst_allele_info(cgMLST_alleles)
    pprint(data_dict)
    alleles_id = data_dict['_id']

    # Index document referring to the allele record.
    cgMLST_index = {
        'cgmlst_schema': {'_id': schema_id},
        'type': "Trie",
        'order': [0, 2, 1],
        'index': {'12': {'12': {'235': [alleles_id]}}}
    }
    data_dict = dump_cgmlst_index_info(cgMLST_index)
    index_id = data_dict['_id']
    indices = get_cgMLST_index([index_id])
    pprint(indices)

    # Clean up every document created above, including the test schema
    # (previously the index was deleted twice and the schema was left behind).
    delete_cgMLST_index(index_id)
    delete_cgMLST_alleles(alleles_id)
    delete_cgMLST_schema(schema_id)