forked from tirthajyoti/Design-of-experiment-Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DOE_functions.py
548 lines (430 loc) · 26.1 KB
/
DOE_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
#====================
# Essential imports
#====================
from pyDOE import *
from pyDOE_corrected import *
from diversipy import *
import pandas as pd
import numpy as np
# ===========================================================================================================
# Function for constructing a DataFrame from a numpy array generated by PyDOE function and individual lists
# ===========================================================================================================
def construct_df(x, r):
    """
    Construct a DataFrame of real factor values from a matrix of level indices.

    Parameters
    ----------
    x : numpy array of shape (runs, factors) whose entries are integer-valued
        level indices (e.g. the output of a pyDOE design builder).
    r : sequence of per-factor level lists; r[j][k] is the real value of
        level k for factor j.

    Returns
    -------
    pandas DataFrame of dtype float32 with every index replaced by the
    corresponding real factor value.
    """
    df = pd.DataFrame(data=x, dtype='float32')
    for i in df.index:
        for j in range(df.shape[1]):
            # Single-step .iloc[i, j] indexing: the original chained form
            # df.iloc[i][j] = ... assigns into a temporary row copy and
            # silently fails under pandas copy-on-write (pandas >= 2.x).
            df.iloc[i, j] = r[j][int(df.iloc[i, j])]
    return df
# ===================================================================================================
# Function for constructing a DataFrame from a matrix with floating point numbers between -1 and +1
# ===================================================================================================
def construct_df_from_matrix(x, factor_array):
    """
    Project a coded design matrix (values centered on -1/0/+1) onto real factor ranges.

    x is assumed to be a numpy array produced by a design-of-experiment builder,
    with entries ranging around -1 to +1; factor_array is a numpy array whose
    row j holds the (low, mid, high) levels for factor j.
    Returns a pandas DataFrame of the projected real values.
    """
    num_rows, num_cols = x.shape

    def decode(coded, levels):
        # The three exact codes map straight onto the supplied low/mid/high levels.
        if coded == -1:
            return levels[0]
        if coded == 0:
            return levels[1]
        if coded == 1:
            return levels[2]
        # Axial (star) points lie beyond +/-1: extrapolate linearly, using
        # half of the low-to-high span as the unit step.
        step = np.abs(levels[2] - levels[0]) / 2
        if coded < 0:
            return levels[0] - (np.abs(coded) - 1) * step
        return levels[2] + (coded - 1) * step

    projected = np.zeros((num_rows, num_cols))
    for row in range(num_rows):
        for col in range(num_cols):
            projected[row, col] = decode(x[row, col], factor_array[col])
    return pd.DataFrame(data=projected)
# =================================================================================================
# Function for constructing a DataFrame from a matrix with floating point numbers between 0 and 1
# =================================================================================================
def construct_df_from_random_matrix(x, factor_array):
    """
    Project a unit-hypercube sample matrix onto real factor ranges.

    x is assumed to be a numpy array with entries in [0, 1] (the output of a
    space-filling/random design builder); factor_array is a numpy array whose
    row j holds the (min, max) range for factor j.
    Returns a pandas DataFrame of the projected real values.
    """
    num_rows, num_cols = x.shape
    projected = np.zeros((num_rows, num_cols))
    for row in range(num_rows):
        for col in range(num_cols):
            # Linear interpolation: min + fraction * |max - min|.
            span = np.abs(factor_array[col][1] - factor_array[col][0])
            projected[row, col] = factor_array[col][0] + x[row, col] * span
    return pd.DataFrame(data=projected)
# ======================================================================================
# Function for building full factorial DataFrame from a dictionary of process variables
# ======================================================================================
def build_full_fact(factor_level_ranges):
    """
    Builds a full factorial design dataframe from a dictionary of factor/level ranges.

    Example of the process variable dictionary:
    {'Pressure':[50,60,70],'Temperature':[290, 320, 350],'Flow rate':[0.9,1.0]}
    """
    # Number of levels per factor and the level values themselves, in dict order.
    level_counts = [len(levels) for levels in factor_level_ranges.values()]
    level_values = list(factor_level_ranges.values())
    design = fullfact_corrected(level_counts)
    df = construct_df(design, level_values)
    df.columns = factor_level_ranges.keys()
    return df
# ================================================================================================================================================================
# Function for building 2-level fractional factorial DataFrame from a dictionary of process variables and a generator string representing the confounded variable
# ================================================================================================================================================================
def build_frac_fact(factor_level_ranges, gen_string):
    """
    Builds a 2-level fractional factorial design dataframe from a dictionary of
    factor/level ranges and a generator (alias) string.

    Only min and max values of each range are used. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}

    The generator string defines the alias structure, i.e. how factor main
    effects are deliberately confounded with interaction effects to shrink the
    run count below a full factorial. Alias structures are written like
    "C = AB" or "I = ABC" or "AB = CD".

    EXAMPLE
    ------------
    With three factors A, B, C and the alias "C = AB", the C column of the
    design is the product of the A and B columns, so the 8-run full factorial
    collapses to a 4-run half fraction (and A is confounded with BC, B with AC).

    Returns None (after printing a message) when the generator string does not
    name exactly one term per factor.
    """
    num_factors = len(factor_level_ranges)
    # Collapse any factor with more than two levels down to its two end points.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
    if num_factors != len(gen_string.split(' ')):
        print("Length of the generator string for the fractional factorial build does not match the length of the process variables dictionary")
        return None
    level_pairs = [factor_level_ranges[key] for key in factor_level_ranges]
    coded = fracfact(gen_string)
    # Recode -1 -> 0 so the matrix holds level indices instead of +/-1 codes.
    coded = np.vectorize(lambda v: 0 if v == -1 else v)(coded)
    df = construct_df(coded, level_pairs)
    df.columns = factor_level_ranges.keys()
    return df
# =====================================================================================
# Function for building Plackett-Burman designs from a dictionary of process variables
# =====================================================================================
def build_plackett_burman(factor_level_ranges):
    """
    Builds a Plackett-Burman design dataframe from a dictionary of factor/level ranges.

    Only min and max values of each range are used. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}

    Plackett-Burman designs (Plackett & Burman, 1946) are screening designs that
    estimate main effects with a minimal number of runs, treating interactions
    as negligible. The number of runs grows in multiples of four (4, 8, 12, ...),
    and the maximum number of factors a given size supports is one less than the
    next higher multiple of four.
    """
    # Collapse any factor with more than two levels down to its two end points.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
    level_pairs = [factor_level_ranges[key] for key in factor_level_ranges]
    coded = pbdesign(len(factor_level_ranges))
    # Recode -1 -> 0 so the matrix holds level indices instead of +/-1 codes.
    coded = np.vectorize(lambda v: 0 if v == -1 else v)(coded)
    df = construct_df(coded, level_pairs)
    df.columns = factor_level_ranges.keys()
    return df
# ===================================================================================
# Function for building Sukharev Grid designs from a dictionary of process variables
# ===================================================================================
def build_sukharev(factor_level_ranges, num_samples=None):
    """
    Builds a Sukharev-grid hypercube design dataframe from a dictionary of factor/level ranges.

    Only min and max values of each range are used. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}

    num_samples: Number of samples to be generated. Must be a perfect k-th
    power where k is the number of factors; otherwise it is rounded up to the
    next perfect power. Defaults to the number of factors when omitted.

    Special property of this grid is that points are not placed on the boundaries
    of the hypercube, but at centroids of the subcells constituted by individual
    samples. This design offers optimal results for the covering radius regarding
    distances based on the max-norm.
    """
    # Collapse any factor with more than two levels down to its two end points.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
    factor_count = len(factor_level_ranges)
    factor_lists = [factor_level_ranges[key] for key in factor_level_ranges]
    # Bug fix: the original crashed with a TypeError when num_samples was left
    # at its default of None; fall back to factor_count like the sibling builders.
    if num_samples is None:
        num_samples = factor_count
    # A Sukharev grid requires num_samples to be an integer raised to the power
    # of factor_count; if not, bump it up to the next perfect power.
    check = num_samples ** (1 / factor_count)
    if check - int(check) > 1e-5:
        num_samples = (int(check) + 1) ** (factor_count)
        print("\nNumber of samples not adequate to fill a Sukharev grid. Increasing sample size to: ", num_samples)
    x = sukharev_grid(num_points=num_samples, dimension=factor_count)
    factor_lists = np.array(factor_lists)
    df = construct_df_from_random_matrix(x, factor_lists)
    df.columns = factor_level_ranges.keys()
    return df
# ===================================================================================
# Function for building Box-Behnken designs from a dictionary of process variables
# ===================================================================================
def build_box_behnken(factor_level_ranges, center=1):
    """
    Builds a Box-Behnken design dataframe from a dictionary of factor/level ranges.

    Three levels per factor are necessary; if only two are given, a third
    mid-point level is created automatically by averaging the end points.
    Example dictionary:
    {'Pressure':[50,60,70],'Temperature':[290, 320, 350],'Flow rate':[0.9,1.0,1.1]}

    Box-Behnken designs (Box & Behnken, 1960) are response-surface designs where
    each factor takes one of three equally spaced coded values (-1, 0, +1),
    sufficient to fit a quadratic model with a reasonable ratio of experimental
    points to model coefficients.
    """
    # Guarantee three levels per factor: insert a mid-point when only the two
    # end points were supplied.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) == 2:
            factor_level_ranges[key].append((factor_level_ranges[key][0] + factor_level_ranges[key][1]) / 2)
            factor_level_ranges[key].sort()
            print(f"{key} had only two end points. Creating a mid-point by averaging them")
    level_triples = [factor_level_ranges[key] for key in factor_level_ranges]
    design = bbdesign_corrected(len(factor_level_ranges), center=center)
    design = design + 1  # shift codes from {-1,0,1} to {0,1,2} index space
    df = construct_df(design, level_triples)
    df.columns = factor_level_ranges.keys()
    return df
# =====================================================================================================
# Function for building central-composite (Box-Wilson) designs from a dictionary of process variables
# =====================================================================================================
def build_central_composite(factor_level_ranges, center=(2,2), alpha='o', face='ccc'):
    """
    Builds a central-composite (Box-Wilson) design dataframe from a dictionary of factor/level ranges.

    Only min and max values of each range are required. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}

    A central composite design builds a second-order (quadratic) response-surface
    model without a full three-level factorial. It consists of:
    * a (possibly fractional) two-level factorial portion;
    * replicated center points at the median of each factor's levels;
    * axial (star) points varying one factor at a time beyond the factorial range.
    """
    for key in factor_level_ranges:
        # Collapse any factor with more than two levels down to its end points...
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
        # ...then insert the mid-point by averaging the low and high levels.
        if len(factor_level_ranges[key]) == 2:
            factor_level_ranges[key].append((factor_level_ranges[key][0] + factor_level_ranges[key][1]) / 2)
            factor_level_ranges[key].sort()
    bounds = np.array([factor_level_ranges[key] for key in factor_level_ranges])
    coded = ccdesign(len(factor_level_ranges), center=center, alpha=alpha, face=face)
    df = construct_df_from_matrix(coded, bounds)
    df.columns = factor_level_ranges.keys()
    return df
# ====================================================================================
# Function for building simple Latin Hypercube from a dictionary of process variables
# ====================================================================================
def build_lhs(factor_level_ranges, num_samples=None, prob_distribution=None):
    """
    Builds a Latin Hypercube design dataframe from a dictionary of factor/level ranges.

    Only min and max values of each range are required. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}

    num_samples: Number of samples to be generated (defaults to the number of factors).
    prob_distribution: Analytical probability distribution to be applied over the
    randomized sampling; takes strings like 'Normal', 'Poisson', 'Exponential',
    'Beta', 'Gamma'. (Not used by the current sampling call.)

    Latin hypercube sampling (LHS) is a stratified sampling scheme over multiple
    variables, commonly used to reduce the number of Monte Carlo runs needed for
    a reasonably accurate random distribution.
    """
    # Collapse any factor with more than two levels down to its two end points.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
    factor_count = len(factor_level_ranges)
    if num_samples is None:
        num_samples = factor_count
    bounds = np.array([factor_level_ranges[key] for key in factor_level_ranges])
    sample = lhs(n=factor_count, samples=num_samples)
    df = construct_df_from_random_matrix(sample, bounds)
    df.columns = factor_level_ranges.keys()
    return df
# ============================================================================================
# Function for building space-filling Latin Hypercube from a dictionary of process variables
# ============================================================================================
def build_space_filling_lhs(factor_level_ranges, num_samples=None):
    """
    Builds a space-filling Latin Hypercube design dataframe from a dictionary of factor/level ranges.

    Only min and max values of each range are required. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}
    num_samples: Number of samples to be generated (defaults to the number of factors).
    """
    # Collapse any factor with more than two levels down to its two end points.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
    factor_count = len(factor_level_ranges)
    if num_samples is None:
        num_samples = factor_count
    bounds = np.array([factor_level_ranges[key] for key in factor_level_ranges])
    # Build a Latin hypercube and spread its points out for better space filling.
    sample = transform_spread_out(lhd_matrix(num_points=num_samples, dimension=factor_count))
    df = construct_df_from_random_matrix(sample, bounds)
    df.columns = factor_level_ranges.keys()
    return df
# =====================================================================================================
# Function for building designs with random _k-means_ clusters from a dictionary of process variables
# =====================================================================================================
def build_random_k_means(factor_level_ranges, num_samples=None):
    """
    Produces a centroidal Voronoi tessellation of the unit random hypercube and
    generates k-means cluster points, projected onto the given factor ranges.

    Only min and max values of each range are required. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}
    num_samples: Number of samples to be generated (defaults to the number of factors).
    """
    # Collapse any factor with more than two levels down to its two end points.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
    factor_count = len(factor_level_ranges)
    if num_samples is None:
        num_samples = factor_count
    bounds = np.array([factor_level_ranges[key] for key in factor_level_ranges])
    sample = random_k_means(num_points=num_samples, dimension=factor_count)
    df = construct_df_from_random_matrix(sample, bounds)
    df.columns = factor_level_ranges.keys()
    return df
# =============================================================================================
# Function for building maximin reconstruction matrix from a dictionary of process variables
# =============================================================================================
def build_maximin(factor_level_ranges, num_samples=None):
    """
    Builds a maximin reconstructed design dataframe from a dictionary of factor/level ranges.

    Only min and max values of each range are required. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}
    num_samples: Number of samples to be generated (defaults to the number of factors).

    The underlying algorithm iterates to maximize the minimal distance of each
    point in the set to:
    * other points in the set,
    * existing (fixed) points,
    * the boundary of the hypercube.
    """
    # Collapse any factor with more than two levels down to its two end points.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
    factor_count = len(factor_level_ranges)
    if num_samples is None:
        num_samples = factor_count
    bounds = np.array([factor_level_ranges[key] for key in factor_level_ranges])
    sample = maximin_reconstruction(num_points=num_samples, dimension=factor_count)
    df = construct_df_from_random_matrix(sample, bounds)
    df.columns = factor_level_ranges.keys()
    return df
# ========================================================================================
# Function for building Halton matrix based design from a dictionary of process variables
# ========================================================================================
def build_halton(factor_level_ranges, num_samples=None):
    """
    Builds a quasirandom design dataframe from a dictionary of factor/level ranges
    using a Halton sequence (prime-number seeded).

    Only min and max values of each range are required. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}
    num_samples: Number of samples to be generated (defaults to the number of factors).

    Uses the default initialization with the first n prime numbers, where n is
    the number of factors/variables.
    """
    # Collapse any factor with more than two levels down to its two end points.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
    factor_count = len(factor_level_ranges)
    if num_samples is None:
        num_samples = factor_count
    bounds = np.array([factor_level_ranges[key] for key in factor_level_ranges])
    sample = halton(num_points=num_samples, dimension=factor_count)
    df = construct_df_from_random_matrix(sample, bounds)
    df.columns = factor_level_ranges.keys()
    return df
# ==========================================================================================
# Function for building uniform random design matrix from a dictionary of process variables
# ==========================================================================================
def build_uniform_random (factor_level_ranges, num_samples=None):
    """
    Builds a design dataframe with samples drawn from a uniform random distribution,
    projected onto a dictionary of factor/level ranges.

    Only min and max values of each range are required. Example dictionary:
    {'Pressure':[50,70],'Temperature':[290, 350],'Flow rate':[0.9,1.0]}
    num_samples: Number of samples to be generated (defaults to the number of factors).
    """
    # Collapse any factor with more than two levels down to its two end points.
    for key in factor_level_ranges:
        if len(factor_level_ranges[key]) != 2:
            factor_level_ranges[key][1] = factor_level_ranges[key][-1]
            factor_level_ranges[key] = factor_level_ranges[key][:2]
            print(f"{key} had more than two levels. Assigning the end point to the high level.")
    factor_count = len(factor_level_ranges)
    if num_samples is None:
        num_samples = factor_count
    bounds = np.array([factor_level_ranges[key] for key in factor_level_ranges])
    sample = random_uniform(num_points=num_samples, dimension=factor_count)
    df = construct_df_from_random_matrix(sample, bounds)
    df.columns = factor_level_ranges.keys()
    return df