-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path04-Validation.Rmd
635 lines (487 loc) · 24.6 KB
/
04-Validation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
```{r truncated class validation, include=FALSE}
## Returns a function that returns a list with each of its elements defined as functions or vectors of one
## element that describe aspects of a truncated triangular distribution
## (pdf, cdf, inverse cdf, mean, median, mode, upper bound, lower bound, variance).
generate.truncated.triangular <- function(a, b, orig.tri.dist) {
## Original parameters of the triangular distribution
M <- orig.tri.dist$tri.mode
L <- orig.tri.dist$tri.lower
U <- orig.tri.dist$tri.upper
## Throw an error if the new bounds do not fall witin the old bounds of the triangular pdf.
if (a < L | b > U) {
stop("The new bounds of the pdf do not fall within the original range")
}
## Create a function that returns the pdf of the truncated triangular, given some x.
pdf <- function(x) {
if (a < x & x <= b) {
pdf.g <- orig.tri.dist$pdf
cumul.F <- orig.tri.dist$cdf
result <- pdf.g(x) / (cumul.F(b) - cumul.F(a))
return(result)
} else {
result <- 0
return(result)
}
}
## Create a function that returns the cdf of the truncated triangular, given some x.
cdf <- function(x) {
cumul.F <- orig.tri.dist$cdf
if (x <= a) {
result <- 0
} else if (a < x & x <= b) {
result <- (cumul.F(x) - cumul.F(a)) / (cumul.F(b) - cumul.F(a))
} else if (b < x) {
result <- 1
}
return(result)
}
## Create a function that returns the inverse cdf of the truncated triangular, given some probability p.
inverse.cdf <- function(p) {
result <-
orig.tri.dist$inverse.cdf(p * (orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)) + orig.tri.dist$cdf(a))
return(result)
}
## Create a vector of length 1 that describes the mean of the distribution
trun.tri.mean <- if (a <= b & b < M) {
result.numerator <-
(2 * (-3 * L * ((b ^ 2) / 2 - (a ^ 2) / 2) + b ^ 3 - a ^ 3)) / (3 * (U - L) * (M - L))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
result.numerator / result.denominator
} else if (a < M & b >= M) {
result.numerator <-
(
-M ^ 3 * U - 2 * a ^ 3 * U + 3 * L * a ^ 2 * U + 3 * M * b ^ 2 * U - 3 * L * b ^ 2 * U + L * M ^ 3 + 2 * M * a ^ 3 - 3 * L * M * a ^ 2 - 2 * M * b ^ 3 + 2 * L * b ^ 3
) / (3 * (U - L) * (M - L) * (U - M))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
result.numerator / result.denominator
} else if (M <= a & a <= b) {
result.numerator <-
(3 * b ^ 2 * U - 3 * a ^ 2 * U - 2 * b ^ 3 + 2 * a ^ 3) / (3 * (U - L) * (U - M))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
result.numerator / result.denominator
}
## Create a vector of length 1 that describes the median of the distribution
trun.tri.median <- inverse.cdf(0.5)
## Create a vector of length 1 that describes the mode of the distribution
trun.tri.mode <- if (a <= b & b <= M) {
b
} else if (a <= b & b >= M) {
M
} else if (a >= M & b >= a) {
a
}
## Create a vector of length 1 that describes the upper bound of the distribution's domain
trun.tri.upper <- b
## Create a vector of length 1 that describes the lower bound of the distribution's domain
trun.tri.lower <- a
## Create a vector of length 1 that describes the variance of the distribution
trun.tri.var <- if (a <= b & b < M) {
result.numerator <-
(-4 * L * ((b ^ 3) / 3 - (a ^ 3) / 3) + b ^ 4 - a ^ 4) / (2 * (U - L) * (M - L))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
(result.numerator / result.denominator) - trun.tri.mean ^ 2
} else if (a < M & b >= M) {
result.numerator <-
(
-M ^ 4 * U - 3 * a ^ 4 * U + 4 * L * a ^ 3 * U + 4 * M * b ^ 3 * U - 4 * L * b ^ 3 * U + L * M ^ 4 + 3 * M * a ^ 4 - 4 * L * M * a ^ 3 - 3 * M * b ^ 4 + 3 * L * b ^ 4
) / (6 * (U - L) * (M - L) * (U - M))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
(result.numerator / result.denominator) - trun.tri.mean ^ 2
} else if (M <= a & a <= b) {
result.numerator <-
(4 * (b ^ 3) * U - 4 * (a ^ 3) * U - 2 * b ^ 4 + 3 * a ^ 4) / (6 * (U - L) * (U - M))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
(result.numerator / result.denominator) - trun.tri.mean ^ 2
}
## Build the list and return it. This list contains all major properties of the truncated triangular distribution
return(
list(
pdf = pdf,
cdf = cdf,
inverse.cdf = inverse.cdf,
trun.tri.mean = trun.tri.mean,
trun.tri.median = trun.tri.median,
trun.tri.mode = trun.tri.mode,
trun.tri.upper = trun.tri.upper,
trun.tri.lower = trun.tri.lower,
trun.tri.var = trun.tri.var
)
)
}
```
```{r triangular class validation, include=FALSE}
## Returns a function that returns a list with each of its elements defined as functions or vectors of one
## element that describe aspects of a triangular distribution
## (pdf, cdf, inverse cdf, mean, median, mode, upper bound, lower bound, variance)
generate.triangular <- function(L, U, M) {
## Create a function that returns the pdf of the triangular, given some x.
pdf <- function(x) {
if (x < L) {
result <- 0
} else if (x < M) {
result <- 2 * (x - L) / ((U - L) * (M - L))
} else if (x == M) {
result <- 2 / (U - L)
} else if (x <= U) {
result <- 2 * (U - x) / ((U - L) * (U - M))
} else if (U < x) {
result <- 0
}
return(result)
}
## Create a function that returns the cdf of the triangular, given some x.
cdf <- function(x) {
if (x <= L) {
result <- 0
} else if (x <= M) {
result <- ((x - L) ^ 2) / ((U - L) * (M - L))
} else if (x < U) {
result <- 1 - ((U - x) ^ 2) / ((U - L) * (U - M))
} else if (U <= x) {
result <- 1
}
return(result)
}
## Create a function that returns the inverse cdf of the triangular, given some probability p.
inverse.cdf <- function(p) {
if (p < (M - L) / (U - L)) {
result <- L + sqrt(max(0, (M - L) * (U - L) * p))
} else if (p >= (M - L) / (U - L)) {
result <- U - sqrt(max(0, (U - L) * (U - M) * (1 - p)))
}
return(result)
}
## Create a vector of length 1 that describes the mean of the distribution
tri.mean <- (L + U + M) / 3
## Create a vector of length 1 that describes the median of the distribution
tri.median <- inverse.cdf(0.5)
## Create a vector of length 1 that describes the mode of the distribution
tri.mode <- M
## Create a vector of length 1 that describes the upper bound of the distribution's domain
tri.upper <- U
## Create a vector of length 1 that describes the lower bound of the distribution's domain
tri.lower <- L
## Create a vector of length 1 that describes the variance of the distribution
tri.var <-
((L ^ 2) + (U ^ 2) + (M ^ 2) - L * U - L * M - U * M) / 18
## Build the list and return it. This list contains all major properties of the triangular distribution
return(
list(
pdf = pdf,
cdf = cdf,
inverse.cdf = inverse.cdf,
tri.mean = tri.mean,
tri.median = tri.median,
tri.mode = tri.mode,
tri.upper = tri.upper,
tri.lower = tri.lower,
tri.var = tri.var
)
)
}
```
```{r Distributions Validation, include=FALSE}
## Set a seed so results are replicable
set.seed(10)
## Create a list that contains the major properties of a triangular
## distribution with L = 2, U = 9, and M = 7
my.tri.dist <- generate.triangular(L = 2, U = 9, M = 7)
## Create a list that contains the major properties of a truncated
## triangular distribution between a = 3 and b = 8, based on the
## previous triangular
my.trun.tri.dist <-
generate.truncated.triangular(a = 3,
b = 8,
orig.tri.dist = my.tri.dist)
```
```{r Inverse CDF Validation, include=FALSE}
## 1. Generate a sequence of uniform random numbers on the interval [0, 1].
## 10000 should be enough.
U <- runif(n = 10000)
## 2. Transform all elements of the U sequence with the inverse CDF
## of the truncated distribution
simulated.Z.prime.Inv.CDF <- sapply(U, my.trun.tri.dist$inverse.cdf)
```
```{r Simulated Trial and Error Validation, include=FALSE}
## Simulate 10000 instances of a random variable with f(x) = truncated triangular by trial and error.
simulated.Z.prime.t.and.e <- rep(0, 10000)
success.count <- 0
## Stop once we've achieved 10000 succesful numbers
while (success.count < 10000) {
## Step 2
u.1 <-
runif(
n = 1,
min = my.trun.tri.dist$trun.tri.lower,
max = my.trun.tri.dist$trun.tri.upper
)
## Step 3
u.2 <-
runif(
n = 1,
min = 0,
max = my.trun.tri.dist$pdf(my.trun.tri.dist$trun.tri.mode)
)
## Step 4
if (u.2 <= my.trun.tri.dist$pdf(u.1)) {
# Step 5
success.count <- success.count + 1
simulated.Z.prime.t.and.e[success.count] <- u.1
}
}
```
# Validation
To validate our claims that both vectors create samples that match the truncated triangular distribution, a few hypothesis tests are in order.
First, we validate that the sample variance and the distribution variance match; second, we evaluate whether their mean statistics match. If both these tests are passed, we perform a goodness of fit test. Specifically, a _Kolmogorov-Smirnof_ test or a _Chi-Square test_.
These tests rely on a significance level ($\alpha$) which we have appropriately picked to be 0.1. The grater the significance level, the more certain you are that your results are not a product of chance and the true behavior of your random variable.
Usually when using the more common procedures for testing hypothesis, our data needs to meet a few criteria.
## Hypothesis test for Variance
Specifically, when testing for variance equivalence using a Chi-Square ($\chi^{2}$) test, we should validate that the sample variance follows a $\chi^{2}$ distribution.
This usually isn't the case but to the detriment that we introduce some uncertainty, the condition can be assumed if the distribution of the data roughly ressembles a normal distribution and the sample size is big enough (our sample size is 10000, no need to worry there).
If we didn't know that our data follows a triangular distribution this could be a fairly save assumption, but because that really isn't the case, more advanced techniques like bootstraping should be used while performing this test. Such is left as an excercise to the reader.
Let's study the mathematical procedure.
Since we want to validate that the variance of the sample is the same as that of the distribution (some value $\sigma^{2}$), we make opposing statements about it. We call these statements our _null hypothesis_ and our _alternate hypothesis_. In the end, we should be able to say, for some certainty, whether or not we may reject the _null hypothesis_.
$$
H_{0}:\hat{\sigma} ^{2} = \sigma^{2}
$$
$$
H_{a}:\hat{\sigma}^{2} \neq \sigma^{2}
$$
We then calculate the sample variance:
$$
S^{2}=\sum_{i=1}^{n}\frac{(x_{i}-\bar{X})^{2}}{n-1}
$$
and plug it into the test statistic:
$$
\left(\frac{S}{\sigma}\right)^{2}(n-1)\sim\chi^{2}_{n-1}.
$$
The _alternate hypothesis_ tells us that this is a two-tailed test, meaning we must use both ends of the $\chi^{2}$ distribution. We reject $H_{0}$ if the test statistic falls outside the interval bounded by the $\chi^{2}$ inverse CDF evaluated at $1-\frac{\alpha}{2}$ and at $\frac{\alpha}{2}$ with $n-1$ degrees of freedom. This may seem unorthodox, but the most common notation used to reference these bounds is $\chi^{2}_{1-\frac{\alpha}{2},n-1}$ and $\chi^{2}_{\frac{\alpha}{2},n-1}$.
$$
\chi^{2}_{\frac{\alpha}{2},n-1}\leq\left(\frac{S}{\sigma}\right)^{2}(n-1)\leq \chi^{2}_{1-\frac{\alpha}{2},n-1} \Rightarrow \hat{\sigma}^{2}=\sigma^{2}
$$
We shall now implement this in R. Do consider, there is no exact method to calculate the inverse CDF of the $\chi^{2}$ distribution that we could reasonably implement such as a closed form expression, and so we must use R's native function to find approximate values. Most mathematical software come with their own implementation of this method or have third party libraries to extend their functionality (such as SciPy for Python).
Do remember that we have stored the simulated data for each method in their respective variables: `simulated.Z.prime.Inv.CDF` and `simulated.Z.prime.t.and.e`, and the truncated triangular distribution object in the variable `my.trun.tri.dist`.
```{r Hypothesis Variance inverse cdf, echo=TRUE}
## Inverse CDF method
alpha <- 0.1
degrees.of.freedom <- (length(simulated.Z.prime.Inv.CDF) - 1)
## 1. Sample Variance and true variance
s.squared.Inv.CDF <- sum((mean(simulated.Z.prime.Inv.CDF) - simulated.Z.prime.Inv.CDF) ^ 2) / degrees.of.freedom
sigma.squared <- my.trun.tri.dist$trun.tri.var
## 2. Test Statistic
test.statistic <- s.squared.Inv.CDF * degrees.of.freedom / sigma.squared
## 3. Evaluate if statistic falls within bounds
lower.bound.criteria <- test.statistic >= qchisq(p = alpha / 2, df = degrees.of.freedom)
upper.bound.criteria <- test.statistic <= qchisq(p = 1 - alpha / 2, df = degrees.of.freedom)
## 4. Conclude:
print(paste(
"With a significance level of 0.1, we ",
if (lower.bound.criteria & upper.bound.criteria) {
"may not reject the null hypothesis"
} else {
"may reject the null hypothesis"
},
sep = ""))
```
Effectively, via the inverse CDF method we generate a sample with the same variance as the underlying truncated triangular distribution.
```{r Hypothesis Variance trial error, echo=TRUE}
## Trial and Error
degrees.of.freedom <- (length(simulated.Z.prime.t.and.e) - 1)
## 1. Sample Variance
s.squared.t.and.e <- sum((mean(simulated.Z.prime.t.and.e) - simulated.Z.prime.t.and.e) ^ 2) / degrees.of.freedom
## 2. Test Statistic
test.statistic <- s.squared.t.and.e * degrees.of.freedom / sigma.squared
## 3. Evaluate if statistic falls within bounds
lower.bound.criteria <- test.statistic >= qchisq(p = alpha / 2, df = degrees.of.freedom)
upper.bound.criteria <- test.statistic <= qchisq(p = 1 - alpha / 2, df = degrees.of.freedom)
## 4. Conclude:
print(paste(
"With a significance level of 0.1, we ",
if (lower.bound.criteria & upper.bound.criteria) {
"may not reject the null hypothesis"
} else {
"may reject the null hypothesis"
},
sep = ""))
```
Effectively, via the trial and error method we generate a sample with the same variance as the underlying truncated triangular distribution.
We may move on to test for the mean.
## Hypothesis test for mean
Specifically, when testing for mean equivalence using a t-test, we should validate that the sample mean follows a normal distribution. This does not mean that the data should be normally distributed.
This assumption, because of the central limit theorem, should hold up for fairly large datasets like ours. Roughly speaking, this can be assumed of any dataset with over 50 complete observations.
Let's study the mathematical procedure.
We now want to validate that the mean of the sample is the same as that of the distribution (some value $\mu$), we make opposing statements about it.
$$
H_{0}:\hat{\mu} = \mu
$$
$$
H_{a}:\hat{\mu} \neq \mu
$$
We then calculate the sample mean:
$$
\bar{X}=\frac{\sum_{i=1}^{n}x_{i}}{n}
$$
and plug it into the test statistic:
$$
\frac{\bar{X}-\mu}{\frac{S}{\sqrt{n}}} \sim t_{n-1}.
$$
The _alternate hypothesis_ tells us that this is a two-tailed test, meaning we must use both ends of Student's t distribution. We reject $H_{0}$ if the test statistic falls outside the interval bounded by the t inverse CDF evaluated at $1-\frac{\alpha}{2}$ and at $\frac{\alpha}{2}$ with $n-1$ degrees of freedom. This may seem unorthodox, but the most common notation used to reference these bounds is $t_{1-\frac{\alpha}{2},n-1}$ and $t_{\frac{\alpha}{2},n-1}$.
$$
t_{\frac{\alpha}{2},n-1}\leq\frac{\bar{X}-\mu}{\frac{S}{\sqrt{n}}}\leq t_{1-\frac{\alpha}{2},n-1} \Rightarrow \hat{\mu}=\mu
$$
We shall now implement this in R. Same as with the test for variance, there is no exact method to calculate the inverse CDF of Student's t distribution that we could reasonably implement, and so we must use R's native function to find approximate values.
```{r Hypothesis mean inverse cdf, echo=TRUE}
## Inverse CDF method
## 1. Sample mean and true mean
x.bar.Inv.CDF <- mean(simulated.Z.prime.Inv.CDF)
mu <- my.trun.tri.dist$trun.tri.mean
## 2. Test Statistic
test.statistic <- (x.bar.Inv.CDF - mu) / (sqrt(sigma.squared / length(simulated.Z.prime.Inv.CDF)))
## 3. Evaluate if statistic falls within bounds
lower.bound.criteria <- test.statistic >= qt(p = alpha / 2, df = degrees.of.freedom)
upper.bound.criteria <- test.statistic <= qt(p = 1 - alpha / 2, df = degrees.of.freedom)
## 4. Conclude:
print(paste(
"With a significance level of 0.1, we ",
if (lower.bound.criteria & upper.bound.criteria) {
"may not reject the null hypothesis"
} else {
"may reject the null hypothesis"
},
sep = ""))
```
Effectively, via the inverse CDF method we generate a sample with the same mean as the underlying truncated triangular distribution.
```{r Hypothesis mean trial and error, echo=TRUE}
## Trial and Error
## 1. Sample mean and true mean
x.bar.t.and.e <- mean(simulated.Z.prime.t.and.e)
mu <- my.trun.tri.dist$trun.tri.mean
## 2. Test Statistic
test.statistic <- (x.bar.t.and.e - mu) / (sqrt(sigma.squared / length(simulated.Z.prime.t.and.e)))
## 3. Evaluate if statistic falls within bounds
lower.bound.criteria <- test.statistic >= qt(p = alpha / 2, df = degrees.of.freedom)
upper.bound.criteria <- test.statistic <= qt(p = 1 - alpha / 2, df = degrees.of.freedom)
## 4. Conclude:
print(paste(
"With a significance level of 0.1, we ",
if (lower.bound.criteria & upper.bound.criteria) {
"may not reject the null hypothesis"
} else {
"may reject the null hypothesis"
},
sep = ""))
```
Effectively, via the trial and error method we generate a sample with the same mean as the underlying truncated triangular distribution.
We may move on to the goodness of fit tests.
## Goodness of Fit Tests
Goodness of fit tests, for our purposes, serve as a means of validating paired statements about the distribution of our data. Specifically, our _null hypothesis_ would state that the sampled random variable follows a PDF with its respective estimated parameters while our _alternate hypothesis_ tells us exactly the opposite.
In terms of hypothesis:
$$
H_{0}: \hat{f}(x) = f(x)
$$
$$
H_{a}: \hat{f}(x) \neq f(x) \Leftrightarrow \neg H_{0}.
$$
Multiple tests have been devised for goodness of fit, each with their own strengths. It is up to us to determine the most sound method.
### Chi-Square Test
The $\chi^2$ test is particularly useful when fitting data with a discrete distributions of moderate size, though the procedure can be extended to continuous distributions by _binning_ the domain into intervals or classes of equal probability. For samples as large as ours this approach is perhaps not well suited since it is highly likely that the test will fail for most distributions since we require many bins to split the distribution and the probability of each becomes very narrow. Nonetheless, the procedure is enriching.
1. Estimate the parameters of the distributions you suspect the data shuld follow. We will use the parameters of the distribution we want the sample to resemble instead (Those of the truncated triangular distriburtion).
> Let $X \sim f(x)=Triangular(L,U,M)$ and $Z \sim h(x) = f(z| a < X \leq b)$
where $L = 2$, $U = 9$, $M = 7$, $a = 3$, and $b = 8$.
2. Estimate the amount of classes or bins to categorize the continuous distribution. Often, for large datasets, it is recommended to pick $\frac{\sqrt{n}}{5} \leq k \leq \sqrt{n}$.
$$
20 \leq k \leq 100.
$$
We will take $k$ to be 100.
3. Calculate the probability that every class should have ($p=1/k$) and find the bounds for each class with said probability.
> Let H(x) be the CDF to the random variable Z.
We define the lower and upper bounds of each class $i\in \{ 1,\cdots, k\}$ to be $l_{i}$ and $u_{i}$. A value $z$ is said to belong to class $i$ if $l_{i} \leq z < u_{i}$
$$
l_{i} = H^{-1}(p(i-1)), \ u_{i} = H^{-1}(p*i)
$$
4. Once all classes have had their bounds calculated, we may count how many members of the random sample belong to each class ($o_{i}$, observed frequency), and how many members we expected to beling according to its assigned probability ($e_{i} = p*n$, expected frequency).
5. We ma now calculate the test statistic
$$
\sum_{i=1}^{k}\frac{(o_{i}-e_{i})^{2}}{e_{i}} \sim \chi^{2}_{k-r-1}
$$
Notice how the statistic follows a $\chi^{2}$ distribution with $k-r-1$ degrees of freedom. Where $k$ is the number of classes and $r$ is the amount of parameters of the distribution (5 for the truncated triangular).
6. Once again, consider you level of significance and evaluate the critical value after which the _null hypothesis_ should be rejected. This is a right tail test.
$$
\sum_{i=1}^{k}\frac{(o_{i}-e_{i})^{2}}{e_{i}}\leq \chi^{2}_{1-\alpha,k-r-1} \Rightarrow \hat{h}(z)=h(z)
$$
Let's see how we could go about doing this in R. First, for the inverse CDF method.
```{r Chi square goodness inv cdf, echo=TRUE}
## Inverse CDF sample
## Find the intervals for each class i in {1, ..., k}
set.1.to.k <- 1:100
p <- 1/100
l_i <- sapply(p * (set.1.to.k - 1), my.trun.tri.dist$inverse.cdf)
u_i <- sapply(p * set.1.to.k, my.trun.tri.dist$inverse.cdf)
## Find the observed frequency of every class
e_i <-
sapply(set.1.to.k, function(i) {
sum(simulated.Z.prime.Inv.CDF >= l_i[i]
& simulated.Z.prime.Inv.CDF <= u_i[i]
)
})
## Find the expected frequency
o_i <- p * 10000
## Calculate the test statistic
test.statistic <- sum(((o_i - e_i) ^ 2) / e_i)
## Calculate the Chi Square ritical value
critical.value <- qchisq(1-alpha, df = 100 - 5 - 1)
## Conclude
print(paste(
"With a significance level of 0.1, we ",
if (test.statistic <= critical.value) {
"may not reject the null hypothesis"
} else {
"may reject the null hypothesis"
},
sep = ""))
```
Now for trial and error.
```{r Chi square goodness t and e, echo=TRUE}
## Trial and Error
## Find the observed frequency of every class
e_i <-
sapply(set.1.to.k, function(i) {
sum(simulated.Z.prime.t.and.e >= l_i[i]
& simulated.Z.prime.t.and.e <= u_i[i]
)
})
## Calculate the test statistic
test.statistic <- sum(((o_i - e_i) ^ 2) / e_i)
## Conclude
print(paste(
"With a significance level of 0.1, we ",
if (test.statistic <= critical.value) {
"may not reject the null hypothesis"
} else {
"may reject the null hypothesis"
},
sep = ""))
```
Effectively, it would seem that both our procedures for generating a sample from the truncated triangular distribution were succesful. Let us move on to a more well suited test for continuous distributions.
### Kolmogorov-Smirnov Test
The Komogorov-Smirnov test is more robust for goodness of fit procedures than the $\chi^{2}$ because it allows for samples of any size and it does not require that you split continuous functions into bins or classes, which is undesirable as it introduces a source of error.
The procedure goes as follows:
1. Order the random sample from smalles to largest, without removing duplcates (yields $y_{j}, \forall j\in \{1,\cdots, n \}$).
2. Calculate the test statistic as follows:
$$
D^{+} = \max_{y_{j}, \forall j\in \{1,\cdots, n\}}{\{ j/n - H(y_{j})\}},\ D^{-} = \max_{y_{j}, \forall j\in \{1,\cdots, n\}}{\{ H(y_{j}) - (j-1)/n\}}
$$
$$
D = \max{\{ D^{+}, D^{-}\}}
$$
3. If $D$ is greater than the K-S critical value for $\alpha = 0.1$ and $n=10,000$, reject the _null hypothesis_.
The procedure seems fairly straight forward, except for the part where we calculate the K-S critical value. This time, at least in R, you might require some outside resources such as a K-S table or a library that can simulate the statistic with which to compare. We will not implement this procedure in R since we already have evidence that the samples follow the truncated triangular distribution described in previous chapters.
## Further Analysis
Keep in mind that these tests all have their respective counterparts wherin a comparison is made not between a dataset and a distribution but between datasets, meaning we could also ask ourselves whether or not both methods for simulating a probability density function produce similar samples in terms of variance, mean, and goodness of fit.
Notice also that we could have used a much more restrictive level of significance. In fact, the smallest p-value among all tests could yield us an $\alpha$ for which all tests would still pass and make our reasearch much more significant, but such is considered a bad practice. Remember the term _p-hacking_ for future reference and frown upon anyone who abuses statistics in this fashion.