-
Notifications
You must be signed in to change notification settings - Fork 0
/
svd_samples.py
364 lines (296 loc) · 11.1 KB
/
svd_samples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#!/usr/bin/env python3
###############################################################
### svd_samples version 240724 ##
### (C) 2024 Adams Vallejos ##
###############################################################
import shutil
import mrcfile
import sys
import numpy
from numpy import pad, zeros, sqrt, matmul, matrix, array
from numpy.linalg import eig, svd
import os
from os.path import basename, splitext
from pandas import DataFrame, set_option
import matplotlib.pyplot as plt
# Use a class to retrieve all information in a table, the labels are the following
class MrcHeader:
def __init__(self):
self.nx = None
self.ny = None
self.nz = None
self.mode = None
self.nxstart = None
self.nystart = None
self.nzstart = None
self.mx = None
self.my = None
self.mz = None
self.cella = None
self.cellb = None
self.mapc = None
self.mapr = None
self.maps = None
self.dmin = None
self.dmax = None
self.dmean = None
self.ispg = None
self.nsymbt = None
self.extra1 = None
self.exttyp = None
self.nversion = None
self.extra2 = None
self.origin = None
self.map = None
self.machst = None
self.rms = None
self.nlabl = None
self.label = None
self.data = None
self.voxel_size = None
def parse(self, filename):
with mrcfile.open(filename,'r') as mrc:
# TODO: Set the proper type to every attribute
self.nx = int(mrc.header.nx)
self.ny = int(mrc.header.ny)
self.nz = int(mrc.header.nz)
self.mode = int(mrc.header.mode)
self.nxstart = int(mrc.header.nxstart)
self.nystart = int(mrc.header.nystart)
self.nzstart = int(mrc.header.nzstart)
self.mx = int(mrc.header.mx)
self.my = int(mrc.header.my)
self.mz = int(mrc.header.mz)
self.cella = array(mrc.header.cella)
self.cellb = mrc.header.cellb
self.mapc = int(mrc.header.mapc)
self.mapr = int(mrc.header.mapr)
self.maps = int(mrc.header.maps)
self.dmin = mrc.header.dmin
self.dmax = mrc.header.dmax
self.dmean = mrc.header.dmean
self.ispg = mrc.header.ispg
self.nsymbt = mrc.header.nsymbt
self.extra1 = mrc.header.extra1
self.exttyp = mrc.header.exttyp
self.nversion = mrc.header.nversion
self.extra2 = mrc.header.extra2
self.origin = mrc.header.origin
self.map = mrc.header.map
self.machst = mrc.header.machst
self.rms = mrc.header.rms
self.nlabl = mrc.header.nlabl
self.label = mrc.header.label
self.data = mrc.data
self.voxel_size = mrc.voxel_size
def get_data(self):
return self.data
def to_dict(self):
return {
'nx': self.nx,
'ny': self.ny,
'nz': self.nz,
'mode': self.mode,
'nxstart': self.nxstart,
'nystart': self.nystart,
'nzstart': self.nzstart,
'mx': self.mx,
'my': self.my,
'mz': self.mz,
'cella': self.cella,
'cellb': self.cellb,
'mapc': self.mapc,
'mapr': self.mapr,
'maps': self.maps,
'dmin': self.dmin,
'dmax': self.dmax,
'dmean': self.dmean,
'ispg': self.ispg,
'nsymbt': self.nsymbt,
'extra1': self.extra1,
'exttyp': self.exttyp,
'nversion': self.nversion,
'extra2': self.extra2,
'origin': self.origin,
'map': self.map,
'machst': self.machst,
'rms': self.rms,
'nlabl': self.nlabl,
'label': self.label,
'voxel_size': self.voxel_size,
}
# END OF HEADER CLASS
# Map dimensions nx, ny, nz These vary from map to map
# corresponding to map dimensions NC, NR, NS (fast axis, medium axis, slow axis)
# These are orientated According to MAPC, MAPR, MAPS keywords
# WHEN TAKING THE DATA FROM AN ORIGINAL MAP USE MODE 'r' ONLY
# START BY READING THE HEADERS
def set_workspace(workspace):
if not os.getcwd() == workspace:
os.chdir(workspace)
return f'Working on: {workspace}'
def maps_from_list(list):
files = []
with open(list) as lst:
for item in lst:
files.append(item.strip('\n'))
return files
def block_dimensions(list):
'''
Returns a dictionary of map of {axis: max dimension} from a list of ccp4 maps
list: lst
list of files to read
Map dimensions NX, NY, NZ vary from map to map,
corresponding to map dimensions NC, NR, NS (fast axis, medium axis, slow axis).
These are orientated According to MAPC, MAPR, MAPS keywords
'''
nc = 0
nr = 0
ns = 0
for i in range(len(list)):
# Close file after reading data to avoid overloading memory
with mrcfile.open(list[i],mode='r',header_only=True) as f:
nc = max(int((nc)), int(f.header.nx))
nr = max(int((nr)), int(f.header.ny))
ns = max(int((ns)), int(f.header.nz))
# Switch dimension labels 1,2,3 -> 0,1,2
mapc = f.header.mapc - 1
mapr = f.header.mapr - 1
maps = f.header.maps - 1
return dict([(mapc, nc), (mapr, nr), (maps, ns)])
def check_origin(list):
'''
Verifies if the origin of the coordinate system of all maps is to (0, 0, 0).
Example:
rec.array((0., 0., 0.),dtype=[('x', '<f4'), ('y', '<f4'), ('z', '<f4')])
Analogous to pandas table:
- Keys: 'x', 'y', 'z'
- type '<f4': Little endian 32 byte float, the least significant byte
(the "little end") of the data is placed at the byte
with the lowest address.
'''
for i in range(len(list)):
# Close file after reading data to avoid overloading memory
with mrcfile.open(list[i],mode='r',header_only=True) as f:
if i == 0:
origin = f.header.origin
if f.header.origin != origin:
raise TypeError('Unequal origin')
if origin.item() == (0,0,0):
return origin
raise TypeError('Origin not at (0, 0, 0)')
# Load data from list
def load_data(list, as_df = True):
'''
Returns
data: A list of 3D arrays from a list of ccp4 maps
headers: A pandas dataframe with information of all headers
'''
data = []
headers = []
for mrc_file in list:
header = MrcHeader()
header.parse(mrc_file)
headers.append(header.to_dict())
data.append(header.get_data())
if as_df:
headers = DataFrame(headers)
return headers, data
# Generate global variables according to header info:
def get_grid_specs(headers,swap_axes=None):
'''
Returns a dictionary of 'axis': units
and sets a number of global variables to preserve across all maps.
'''
# set the units to be preserved along all the maps
# Select the unit cell intervals from the first as a reference
if not headers.mx.eq(headers.mx[0]).all():
raise TypeError('Unequal sampling grid along X axis!')
if not headers.my.eq(headers.my[0]).all():
raise TypeError('Unequal sampling grid along Y axis!')
if not headers.mz.eq(headers.mz[0]).all():
raise TypeError('Unequal sampling grid along Z axis!')
MX, MY, MZ = headers.iloc[0]['mx'], headers.iloc[0]['my'], headers.iloc[0]['mz']
globals()['MX'], globals()['MY'], globals()['MZ'] = MX, MY, MZ
# Set the largest array dimension
NX, NY, NZ = headers['nx'].max(), headers['ny'].max(), headers['nz'].max()
globals()['NX'], globals()['NY'], globals()['NZ'] = NX, NY, NZ
print(f'These values will be output to the 3D grid:\n'
+ f'nx: {NX}, ny: {NY}, nz: {NZ}, \n'
+ f'mx: {MX}, my = {MY}, mz = {MZ} \n')
if swap_axes:
# This is added since (mapc, mapr, mapc) in maps exported
# from different programs do not do not give the proper
# ordering of the data array as described in the header.
return {int(swap_axes[i]) - 1: dim for i, dim in enumerate([NX, NY, NZ])}
return {headers.iloc[0]['mapc'] - 1: NX,
headers.iloc[0]['mapr'] - 1: NY,
headers.iloc[0]['maps'] - 1: NZ}
# PERFORM ARRAY PADDING
def pad_array(list_of_arrays):
padded_data = []
for arr in list_of_arrays:
# PAD WITH ZEROS AT THE START OF ARRAY
# pad_width = [(map_specs[i]-arr.shape[i],0) for i in range(3)]
# PAD WITH ZEROS AT THE END OF ARRAY
pad_width = [(0,map_specs[i]-arr.shape[i]) for i in range(3)]
padded_data.append(pad(array=arr, pad_width=pad_width, mode='constant', constant_values=0))
return padded_data
# WRITE THE DATA TO OUTPUT COPY OF REFERENCE
def write_map(new_data, in_map, out_map):
shutil.copy(in_map, out_map)
with mrcfile.open(out_map, 'r+') as f:
f.set_data(new_data)
f.update_header_from_data()
f.update_header_stats()
f.header.mx = MX
f.header.my = MY
f.header.mz = MZ
# --- USER INPUT PARAMETERS
LIGHT = "light"
RES='B'
WORKSPACE = f'/PATH/TO/RESAMPLING/output/{LIGHT}/{RES}'
out_dir = 'svd_FoFo'
print(f'Working on {WORKSPACE}')
set_workspace(WORKSPACE)
# Load a list of map data
# files = maps_from_list('../maps_fofo.lst')
files = []
for i in range(0,100):
files.append(f'{i:03}/analysis_fofo_0/merge_trunc_cad_uniq_fofo_map_coeffs.ccp4')
print(f'Reading maps grid specs')
try:
os.mkdir(out_dir)
print(f"Directory '{out_dir}' created ")
except FileExistsError:
print(f"Directory '{out_dir}' already exists")
headers, data = load_data(files)
# Set the values to be used across the maps
map_specs = get_grid_specs(headers,swap_axes=[3,2,1])
# map_specs
# PERFORM PADDING ACCORDING TO BLOCK SPECS
padded_data = pad_array(data)
print(f'Padding data arrays')
# DO SVD
# Generate a matrix of zeros of dimensions MAP SIZE x NUMBER OF MAPS
A = zeros((padded_data[0].size, len(files)))
for i in range(len(files)):
A[:,i] = padded_data[i].reshape(-1)
print(f'Calculating SVD')
# SVD method_1
cov = A.T @ A
L,V = eig(cov)
S1 = sqrt(L[0])
for i in range(A.shape[1]):
print(f'{i+1}/{A.shape[1]}', end='')
U1 = matmul(A,V[:,i])/S1
Ar = S1*(matrix(U1).T)*(matrix(V[:,i]))
# shutil.copy(files[i], f'{outdir}/{splitext(files[i])[0]}.ccp4')
S1 = sqrt(L[0])
U1 = matmul(A,V[:,0])/S1
Ar = S1*(matrix(U1).T)*(matrix(V[:,0]))
Ar = array(Ar).astype(numpy.float32)
# Copy a reference map and set the new data to it.
write_map(Ar[:,0].reshape(padded_data[0].shape), in_map=files[i], out_map=f'{out_dir}/svd_{splitext(basename(files[i]))[0]}.ccp4')
print('', end='\r')
print('JOB FINISHED!')