-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathdata.py
524 lines (442 loc) · 19.6 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
import pickle
import numpy as np
from openbabel import pybel
from math import ceil, sin, cos, sqrt, pi
from itertools import combinations
class Featurizer():
"""Calcaulates atomic features for molecules. Features can encode atom type,
native pybel properties or any property defined with SMARTS patterns
Attributes
----------
FEATURE_NAMES: list of strings
Labels for features (in the same order as features)
NUM_ATOM_CLASSES: int
Number of atom codes
ATOM_CODES: dict
Dictionary mapping atomic numbers to codes
NAMED_PROPS: list of string
Names of atomic properties to retrieve from pybel.Atom object
CALLABLES: list of callables
Callables used to calculcate custom atomic properties
SMARTS: list of SMARTS strings
SMARTS patterns defining additional atomic properties
"""
def __init__(self, atom_codes=None, atom_labels=None,
named_properties=None, save_molecule_codes=True,
custom_properties=None, smarts_properties=None,
smarts_labels=None):
"""Creates Featurizer with specified types of features. Elements of a
feature vector will be in a following order: atom type encoding
(defined by atom_codes), Pybel atomic properties (defined by
named_properties), molecule code (if present), custom atomic properties
(defined `custom_properties`), and additional properties defined with
SMARTS (defined with `smarts_properties`).
Parameters
----------
atom_codes: dict, optional
Dictionary mapping atomic numbers to codes. It will be used for
one-hot encoging therefore if n different types are used, codes
shpuld be from 0 to n-1. Multiple atoms can have the same code,
e.g. you can use {6: 0, 7: 1, 8: 1} to encode carbons with [1, 0]
and nitrogens and oxygens with [0, 1] vectors. If not provided,
default encoding is used.
atom_labels: list of strings, optional
Labels for atoms codes. It should have the same length as the
number of used codes, e.g. for `atom_codes={6: 0, 7: 1, 8: 1}` you
should provide something like ['C', 'O or N']. If not specified
labels 'atom0', 'atom1' etc are used. If `atom_codes` is not
specified this argument is ignored.
named_properties: list of strings, optional
Names of atomic properties to retrieve from pybel.Atom object. If
not specified ['hyb', 'heavyvalence', 'heterovalence',
'partialcharge'] is used.
save_molecule_codes: bool, optional (default True)
If set to True, there will be an additional feature to save
molecule code. It is usefeul when saving molecular complex in a
single array.
custom_properties: list of callables, optional
Custom functions to calculate atomic properties. Each element of
this list should be a callable that takes pybel.Atom object and
returns a float. If callable has `__name__` property it is used as
feature label. Otherwise labels 'func<i>' etc are used, where i is
the index in `custom_properties` list.
smarts_properties: list of strings, optional
Additional atomic properties defined with SMARTS patterns. These
patterns should match a single atom. If not specified, deafult
patterns are used.
smarts_labels: list of strings, optional
Labels for properties defined with SMARTS. Should have the same
length as `smarts_properties`. If not specified labels 'smarts0',
'smarts1' etc are used. If `smarts_properties` is not specified
this argument is ignored.
"""
# Remember namse of all features in the correct order
self.FEATURE_NAMES = []
if atom_codes is not None:
if not isinstance(atom_codes, dict):
raise TypeError('Atom codes should be dict, got %s instead'
% type(atom_codes))
codes = set(atom_codes.values())
for i in range(len(codes)):
if i not in codes:
raise ValueError('Incorrect atom code %s' % i)
self.NUM_ATOM_CLASSES = len(codes)
self.ATOM_CODES = atom_codes
if atom_labels is not None:
if len(atom_labels) != self.NUM_ATOM_CLASSES:
raise ValueError('Incorrect number of atom labels: '
'%s instead of %s'
% (len(atom_labels), self.NUM_ATOM_CLASSES))
else:
atom_labels = ['atom%s' % i for i in range(self.NUM_ATOM_CLASSES)]
self.FEATURE_NAMES += atom_labels
else:
self.ATOM_CODES = {}
metals = ([3, 4, 11, 12, 13] + list(range(19, 32))
+ list(range(37, 51)) + list(range(55, 84))
+ list(range(87, 104)))
# List of tuples (atomic_num, class_name) with atom types to encode.
atom_classes = [
(5, 'B'),
(6, 'C'),
(7, 'N'),
(8, 'O'),
(15, 'P'),
(16, 'S'),
(34, 'Se'),
([9, 17, 35, 53], 'halogen'),
(metals, 'metal')
]
for code, (atom, name) in enumerate(atom_classes):
if type(atom) is list:
for a in atom:
self.ATOM_CODES[a] = code
else:
self.ATOM_CODES[atom] = code
self.FEATURE_NAMES.append(name)
self.NUM_ATOM_CLASSES = len(atom_classes)
if named_properties is not None:
if not isinstance(named_properties, (list, tuple, np.ndarray)):
raise TypeError('named_properties must be a list')
allowed_props = [prop for prop in dir(pybel.Atom)
if not prop.startswith('__')]
for prop_id, prop in enumerate(named_properties):
if prop not in allowed_props:
raise ValueError(
'named_properties must be in pybel.Atom attributes,'
' %s was given at position %s' % (prop_id, prop)
)
self.NAMED_PROPS = named_properties
else:
# pybel.Atom properties to save
self.NAMED_PROPS = ['hyb', 'heavydegree', 'heterodegree',
'partialcharge']
self.FEATURE_NAMES += self.NAMED_PROPS
if not isinstance(save_molecule_codes, bool):
raise TypeError('save_molecule_codes should be bool, got %s '
'instead' % type(save_molecule_codes))
self.save_molecule_codes = save_molecule_codes
if save_molecule_codes:
# Remember if an atom belongs to the ligand or to the protein
self.FEATURE_NAMES.append('molcode')
self.CALLABLES = []
if custom_properties is not None:
for i, func in enumerate(custom_properties):
if not callable(func):
raise TypeError('custom_properties should be list of'
' callables, got %s instead' % type(func))
name = getattr(func, '__name__', '')
if name == '':
name = 'func%s' % i
self.CALLABLES.append(func)
self.FEATURE_NAMES.append(name)
if smarts_properties is None:
# SMARTS definition for other properties
self.SMARTS = [
'[#6+0!$(*~[#7,#8,F]),SH0+0v2,s+0,S^3,Cl+0,Br+0,I+0]',
'[a]',
'[!$([#1,#6,F,Cl,Br,I,o,s,nX3,#7v5,#15v5,#16v4,#16v6,*+1,*+2,*+3])]',
'[!$([#6,H0,-,-2,-3]),$([!H0;#7,#8,#9])]',
'[r]'
]
smarts_labels = ['hydrophobic', 'aromatic', 'acceptor', 'donor',
'ring']
elif not isinstance(smarts_properties, (list, tuple, np.ndarray)):
raise TypeError('smarts_properties must be a list')
else:
self.SMARTS = smarts_properties
if smarts_labels is not None:
if len(smarts_labels) != len(self.SMARTS):
raise ValueError('Incorrect number of SMARTS labels: %s'
' instead of %s'
% (len(smarts_labels), len(self.SMARTS)))
else:
smarts_labels = ['smarts%s' % i for i in range(len(self.SMARTS))]
# Compile patterns
self.compile_smarts()
self.FEATURE_NAMES += smarts_labels
def compile_smarts(self):
self.__PATTERNS = []
for smarts in self.SMARTS:
self.__PATTERNS.append(pybel.Smarts(smarts))
def encode_num(self, atomic_num):
"""Encode atom type with a binary vector. If atom type is not included in
the `atom_classes`, its encoding is an all-zeros vector.
Parameters
----------
atomic_num: int
Atomic number
Returns
-------
encoding: np.ndarray
Binary vector encoding atom type (one-hot or null).
"""
if not isinstance(atomic_num, int):
raise TypeError('Atomic number must be int, %s was given'
% type(atomic_num))
encoding = np.zeros(self.NUM_ATOM_CLASSES)
try:
encoding[self.ATOM_CODES[atomic_num]] = 1.0
except:
pass
return encoding
def find_smarts(self, molecule):
"""Find atoms that match SMARTS patterns.
Parameters
----------
molecule: pybel.Molecule
Returns
-------
features: np.ndarray
NxM binary array, where N is the number of atoms in the `molecule`
and M is the number of patterns. `features[i, j]` == 1.0 if i'th
atom has j'th property
"""
if not isinstance(molecule, pybel.Molecule):
raise TypeError('molecule must be pybel.Molecule object, %s was given'
% type(molecule))
features = np.zeros((len(molecule.atoms), len(self.__PATTERNS)))
for (pattern_id, pattern) in enumerate(self.__PATTERNS):
atoms_with_prop = np.array(list(*zip(*pattern.findall(molecule))),
dtype=int) - 1
features[atoms_with_prop, pattern_id] = 1.0
return features
def get_features(self, molecule, molcode=None):
"""Get coordinates and features for all heavy atoms in the molecule.
Parameters
----------
molecule: pybel.Molecule
molcode: float, optional
Molecule type. You can use it to encode whether an atom belongs to
the ligand (1.0) or to the protein (-1.0) etc.
Returns
-------
coords: np.ndarray, shape = (N, 3)
Coordinates of all heavy atoms in the `molecule`.
features: np.ndarray, shape = (N, F)
Features of all heavy atoms in the `molecule`: atom type
(one-hot encoding), pybel.Atom attributes, type of a molecule
(e.g protein/ligand distinction), and other properties defined with
SMARTS patterns
"""
if not isinstance(molecule, pybel.Molecule):
raise TypeError('molecule must be pybel.Molecule object,'
' %s was given' % type(molecule))
if molcode is None:
if self.save_molecule_codes is True:
raise ValueError('save_molecule_codes is set to True,'
' you must specify code for the molecule')
elif not isinstance(molcode, (float, int)):
raise TypeError('motlype must be float, %s was given'
% type(molcode))
coords = []
features = []
heavy_atoms = []
for i, atom in enumerate(molecule):
# ignore hydrogens and dummy atoms (they have atomicnum set to 0)
if atom.atomicnum > 1:
heavy_atoms.append(i)
coords.append(atom.coords)
features.append(np.concatenate((
self.encode_num(atom.atomicnum),
[atom.__getattribute__(prop) for prop in self.NAMED_PROPS],
[func(atom) for func in self.CALLABLES],
)))
coords = np.array(coords, dtype=np.float32)
features = np.array(features, dtype=np.float32)
if self.save_molecule_codes:
features = np.hstack((features,
molcode * np.ones((len(features), 1))))
features = np.hstack([features,
self.find_smarts(molecule)[heavy_atoms]])
if np.isnan(features).any():
raise RuntimeError('Got NaN when calculating features')
return coords, features
def to_pickle(self, fname='featurizer.pkl'):
"""Save featurizer in a given file. Featurizer can be restored with
`from_pickle` method.
Parameters
----------
fname: str, optional
Path to file in which featurizer will be saved
"""
# patterns can't be pickled, we need to temporarily remove them
patterns = self.__PATTERNS[:]
del self.__PATTERNS
try:
with open(fname, 'wb') as f:
pickle.dump(self, f)
finally:
self.__PATTERNS = patterns[:]
@staticmethod
def from_pickle(fname):
"""Load pickled featurizer from a given file
Parameters
----------
fname: str, optional
Path to file with saved featurizer
Returns
-------
featurizer: Featurizer object
Loaded featurizer
"""
with open(fname, 'rb') as f:
featurizer = pickle.load(f)
featurizer.compile_smarts()
return featurizer
def rotation_matrix(axis, theta):
"""Counterclockwise rotation about a given axis by theta radians"""
if not isinstance(axis, (np.ndarray, list, tuple)):
raise TypeError('axis must be an array of floats of shape (3,)')
try:
axis = np.asarray(axis, dtype=np.float)
except ValueError:
raise ValueError('axis must be an array of floats of shape (3,)')
if axis.shape != (3,):
raise ValueError('axis must be an array of floats of shape (3,)')
if not isinstance(theta, (float, int)):
raise TypeError('theta must be a float')
axis = axis / sqrt(np.dot(axis, axis))
a = cos(theta / 2.0)
b, c, d = -axis * sin(theta / 2.0)
aa, bb, cc, dd = a * a, b * b, c * c, d * d
bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d
return np.array([[aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)],
[2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)],
[2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]])
# Create matrices for all possible 90* rotations of a box
ROTATIONS = [rotation_matrix([1, 1, 1], 0)]
# about X, Y and Z - 9 rotations
for a1 in range(3):
for t in range(1, 4):
axis = np.zeros(3)
axis[a1] = 1
theta = t * pi / 2.0
ROTATIONS.append(rotation_matrix(axis, theta))
# about each face diagonal - 6 rotations
for (a1, a2) in combinations(range(3), 2):
axis = np.zeros(3)
axis[[a1, a2]] = 1.0
theta = pi
ROTATIONS.append(rotation_matrix(axis, theta))
axis[a2] = -1.0
ROTATIONS.append(rotation_matrix(axis, theta))
# about each space diagonal - 8 rotations
for t in [1, 2]:
theta = t * 2 * pi / 3
axis = np.ones(3)
ROTATIONS.append(rotation_matrix(axis, theta))
for a1 in range(3):
axis = np.ones(3)
axis[a1] = -1
ROTATIONS.append(rotation_matrix(axis, theta))
def rotate(coords, rotation):
"""Rotate coordinates by a given rotation
Parameters
----------
coords: array-like, shape (N, 3)
Arrays with coordinates and features for each atoms.
rotation: int or array-like, shape (3, 3)
Rotation to perform. You can either select predefined rotation by
giving its index or specify rotation matrix.
Returns
-------
coords: np.ndarray, shape = (N, 3)
Rotated coordinates.
"""
global ROTATIONS
if not isinstance(coords, (np.ndarray, list, tuple)):
raise TypeError('coords must be an array of floats of shape (N, 3)')
try:
coords = np.asarray(coords, dtype=np.float)
except ValueError:
raise ValueError('coords must be an array of floats of shape (N, 3)')
shape = coords.shape
if len(shape) != 2 or shape[1] != 3:
raise ValueError('coords must be an array of floats of shape (N, 3)')
if isinstance(rotation, int):
if rotation >= 0 and rotation < len(ROTATIONS):
return np.dot(coords, ROTATIONS[rotation])
else:
raise ValueError('Invalid rotation number %s!' % rotation)
elif isinstance(rotation, np.ndarray) and rotation.shape == (3, 3):
return np.dot(coords, rotation)
else:
raise ValueError('Invalid rotation %s!' % rotation)
# TODO: add make_grid variant for GPU
def make_grid(coords, features, grid_resolution=1.0, max_dist=10.0):
"""Convert atom coordinates and features represented as 2D arrays into a
fixed-sized 3D box.
Parameters
----------
coords, features: array-likes, shape (N, 3) and (N, F)
Arrays with coordinates and features for each atoms.
grid_resolution: float, optional
Resolution of a grid (in Angstroms).
max_dist: float, optional
Maximum distance between atom and box center. Resulting box has size of
2*`max_dist`+1 Angstroms and atoms that are too far away are not
included.
Returns
-------
coords: np.ndarray, shape = (M, M, M, F)
4D array with atom properties distributed in 3D space. M is equal to
2 * `max_dist` / `grid_resolution` + 1
"""
try:
coords = np.asarray(coords, dtype=np.float)
except ValueError:
raise ValueError('coords must be an array of floats of shape (N, 3)')
c_shape = coords.shape
if len(c_shape) != 2 or c_shape[1] != 3:
raise ValueError('coords must be an array of floats of shape (N, 3)')
N = len(coords)
try:
features = np.asarray(features, dtype=np.float)
except ValueError:
raise ValueError('features must be an array of floats of shape (N, F)')
f_shape = features.shape
if len(f_shape) != 2 or f_shape[0] != N:
raise ValueError('features must be an array of floats of shape (N, F)')
if not isinstance(grid_resolution, (float, int)):
raise TypeError('grid_resolution must be float')
if grid_resolution <= 0:
raise ValueError('grid_resolution must be positive')
if not isinstance(max_dist, (float, int)):
raise TypeError('max_dist must be float')
if max_dist <= 0:
raise ValueError('max_dist must be positive')
num_features = f_shape[1]
max_dist = float(max_dist)
grid_resolution = float(grid_resolution)
box_size = ceil(2 * max_dist / grid_resolution + 1)
# move all atoms to the neares grid point
grid_coords = (coords + max_dist) / grid_resolution
grid_coords = grid_coords.round().astype(int)
# remove atoms outside the box
in_box = ((grid_coords >= 0) & (grid_coords < box_size)).all(axis=1)
grid = np.zeros((1, box_size, box_size, box_size, num_features),
dtype=np.float32)
for (x, y, z), f in zip(grid_coords[in_box], features[in_box]):
grid[0, x, y, z] += f
return grid