-
Notifications
You must be signed in to change notification settings - Fork 0
/
Project_1_try.sas
354 lines (301 loc) · 12.2 KB
/
Project_1_try.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
/*
* Project 1: Survival Analysis
* Author: Swati Arora
*/
/* Bind the PROJECT libref to the permanent project folder. */
libname Project '/folders/myfolders/Project1';

/*
 * Read the CSV into Project.test.
 * REPLACE is required because PROJECT is a permanent library: without it a
 * second run leaves the previously imported Project.test in place instead of
 * refreshing it from the file.
 */
proc import datafile='/folders/myfolders/Project1/FermaLogis1.csv'
            out=Project.test
            dbms=csv
            replace;
run;

/*
 * List Project.Ferma.
 * NOTE(review): Project.Ferma is not created anywhere in the visible script
 * (the import above writes Project.test) - presumably it already exists in the
 * library; confirm.
 */
proc print data=Project.Ferma;
run;
/*
* Recode the categorical variables as numeric indicators
* and create a new dataset.
*/
/*
 * Build Project.Ferma1 from Project.Ferma, adding:
 *   AttritionN  - 1 when Attrition = 'Yes', otherwise 0
 *   Gender_num  - 1 when Gender = 'Female', otherwise 0
 *   mar_Status  - 0 Single, 1 Married, 2 Divorced (missing for any other value)
 *   Experience  - age band: 'Fresher' (< 30), 'Moderate' (30 to 44),
 *                 'Highly experienced' (45 and above)
 * YearsAtCompany above 40 is capped at 41 for rows with AttritionN = 0.
 */
data Project.Ferma1;
    set Project.Ferma;

    if Attrition = 'Yes' then AttritionN = 1;
    else AttritionN = 0;

    if Gender = 'Female' then Gender_num = 1;
    else Gender_num = 0;

    if Maritalstatus = 'Single' then mar_Status = 0;
    else if Maritalstatus = 'Married' then mar_Status = 1;
    else if Maritalstatus = 'Divorced' then mar_Status = 2;

    if YearsAtCompany > 40 and AttritionN = 0 then YearsAtCompany = 41;

    /*
     * Fix the width before the first assignment. Otherwise SAS sizes the
     * variable from 'Fresher' (7 characters) and the longer labels are
     * truncated to 'Moderat' and 'Highly '.
     */
    length Experience $ 18;

    /*
     * The middle band is simply "not under 30 and under 45". The previous
     * test (Age > 30) dropped Age = 30 into 'Highly experienced'.
     * Note: a missing Age sorts below every number in SAS, so it still lands
     * in 'Fresher'.
     */
    if Age < 30 then Experience = 'Fresher';
    else if Age < 45 then Experience = 'Moderate';
    else Experience = 'Highly experienced';
run;

proc print data=Project.Ferma1;
run;
/*
 * Attach the columns of Project.test to Project.Ferma1, matching rows on
 * employeenumber.
 * A MERGE with a BY statement needs both inputs ordered by the BY variable
 * and stops with a "BY variables are not properly sorted" error otherwise, so
 * sorted copies are made first. The copies go to WORK so that the permanent
 * inputs are not reordered in place.
 */
proc sort data=Project.Ferma1 out=work._ferma1_sorted;
    by employeenumber;
run;

proc sort data=project.test out=work._test_sorted;
    by employeenumber;
run;

data Project.Ferma1;
    merge work._ferma1_sorted work._test_sorted;
    by employeenumber;
run;

/* Remove the temporary sorted copies. */
proc datasets library=work nolist;
    delete _ferma1_sorted _test_sorted;
quit;
/*
* Goodness of fit test on all the variables excluding Bonus
*/
/*
 * Parametric survival fit, log-normal distribution, Bonus columns left out.
 * Response is YearsAtCompany; AttritionN = 0 marks a censored observation.
 * Swap the DIST= value to repeat the fit for the other distributions.
 * PROBPLOT requests the probability plot for the fitted distribution.
 */
proc lifereg data=Project.Ferma1;
    class BusinessTravel Department EducationField JobRole OverTime;
    model YearsAtCompany*AttritionN(0) =
          Age
          DailyRate
          DistanceFromHome
          EnvironmentSatisfaction
          HourlyRate
          JobInvolvement
          JobLevel
          JobSatisfaction
          MonthlyIncome
          MonthlyRate
          NumCompaniesWorked
          PercentSalaryHike
          PerformanceRating
          RelationshipSatisfaction
          StockOptionLevel
          TotalWorkingYears
          TrainingTimesLastYear
          WorkLifeBalance
          YearsInCurrentRole
          YearsSinceLastPromotion
          YearsWithCurrManager
          Education
          Gender_num
          Mar_Status
          / dist=lnormal;
    probplot;
run;
/*
 * Parametric survival fit, log-normal distribution, with Bonus_1 - Bonus_40
 * and the other CLASS variables entered as effects.
 * Response is YearsAtCompany; AttritionN = 0 marks a censored observation.
 * Swap the DIST= value to repeat the fit for the other distributions.
 */
proc lifereg data=Project.Ferma1;
    class BusinessTravel Department EducationField JobRole OverTime
          Bonus_1 - Bonus_40;
    model YearsAtCompany*AttritionN(0) =
          Age
          DailyRate
          DistanceFromHome
          EnvironmentSatisfaction
          JobInvolvement
          JobLevel
          JobSatisfaction
          MonthlyIncome
          MonthlyRate
          NumCompaniesWorked
          PercentSalaryHike
          PerformanceRating
          RelationshipSatisfaction
          StandardHours
          StockOptionLevel
          TotalWorkingYears
          TrainingTimesLastYear
          WorkLifeBalance
          YearsInCurrentRole
          YearsSinceLastPromotion
          YearsWithCurrManager
          Education
          Gender_num
          Mar_Status
          Bonus_1 - Bonus_40
          BusinessTravel
          Department
          EducationField
          JobRole
          OverTime
          / dist=lnormal;
    probplot;
run;
/*
 * Parametric survival fit, log-logistic distribution.
 * Response is YearsAtCompany; AttritionN = 0 marks a censored observation.
 * Swap the DIST= value to repeat the fit for the other distributions.
 */
proc lifereg data=Project.Ferma1;
    class BusinessTravel Department EducationField JobRole OverTime;
    model YearsAtCompany*AttritionN(0) =
          Age
          DailyRate
          DistanceFromHome
          EmployeeCount
          EmployeeNumber
          EnvironmentSatisfaction
          HourlyRate
          JobInvolvement
          JobLevel
          JobSatisfaction
          MonthlyIncome
          MonthlyRate
          NumCompaniesWorked
          PercentSalaryHike
          PerformanceRating
          RelationshipSatisfaction
          StandardHours
          StockOptionLevel
          TotalWorkingYears
          TrainingTimesLastYear
          WorkLifeBalance
          YearsInCurrentRole
          YearsSinceLastPromotion
          YearsWithCurrManager
          Education
          Gender_num
          Mar_Status
          / dist=llogistic;
    probplot;
run;
/*
 * Parametric survival fit, Weibull distribution.
 * Response is YearsAtCompany; AttritionN = 0 marks a censored observation.
 * Swap the DIST= value to repeat the fit for the other distributions.
 */
proc lifereg data=Project.Ferma1;
    class BusinessTravel Department EducationField JobRole OverTime;
    model YearsAtCompany*AttritionN(0) =
          Age
          DailyRate
          DistanceFromHome
          EmployeeCount
          EmployeeNumber
          EnvironmentSatisfaction
          HourlyRate
          JobInvolvement
          JobLevel
          JobSatisfaction
          MonthlyIncome
          MonthlyRate
          NumCompaniesWorked
          PercentSalaryHike
          PerformanceRating
          RelationshipSatisfaction
          StandardHours
          StockOptionLevel
          TotalWorkingYears
          TrainingTimesLastYear
          WorkLifeBalance
          YearsInCurrentRole
          YearsSinceLastPromotion
          YearsWithCurrManager
          Education
          Gender_num
          Mar_Status
          / dist=weibull;
    probplot;
run;
/*
 * Parametric survival fit, gamma distribution.
 * Response is YearsAtCompany; AttritionN = 0 marks a censored observation.
 * Swap the DIST= value to repeat the fit for the other distributions.
 */
proc lifereg data=Project.Ferma1;
    class BusinessTravel Department EducationField JobRole OverTime;
    model YearsAtCompany*AttritionN(0) =
          Age
          DailyRate
          DistanceFromHome
          EmployeeCount
          EmployeeNumber
          EnvironmentSatisfaction
          HourlyRate
          JobInvolvement
          JobLevel
          JobSatisfaction
          MonthlyIncome
          MonthlyRate
          NumCompaniesWorked
          PercentSalaryHike
          PerformanceRating
          RelationshipSatisfaction
          StandardHours
          StockOptionLevel
          TotalWorkingYears
          TrainingTimesLastYear
          WorkLifeBalance
          YearsInCurrentRole
          YearsSinceLastPromotion
          YearsWithCurrManager
          Education
          Gender_num
          Mar_Status
          / dist=gamma;
    probplot;
run;
/*
* Compare Models
*/
/*
 * Likelihood-ratio comparison of the nested distributions.
 * The log-likelihoods are hard-coded values (presumably copied from the
 * LIFEREG output - confirm they match the current fits).
 *
 *   LRTxy     = -2 * (logL of restricted model x - logL of fuller model y)
 *   p_valuexy = upper-tail chi-square probability of LRTxy
 *               (2 df for exponential vs gamma, 1 df for the others)
 *
 * The p-values use SDF (the survival function) rather than 1 - PROBCHI():
 * for statistics this large PROBCHI rounds to 1 and the subtraction returns
 * exactly 0, whereas SDF keeps the small tail probability.
 */
data CompareModels;
    L_exponential = -578.0670523;
    L_weibull     = -484.3183826;
    L_lognormal   = -480.4612996;
    L_gamma       = -414.3730069;

    LRTEG = -2*(L_exponential - L_gamma);
    LRTEW = -2*(L_exponential - L_weibull);
    LRTWG = -2*(L_weibull - L_gamma);
    LRTLG = -2*(L_lognormal - L_gamma);

    p_valueEG = sdf('CHISQUARE', LRTEG, 2);
    p_valueEW = sdf('CHISQUARE', LRTEW, 1);
    p_valueWG = sdf('CHISQUARE', LRTWG, 1);
    p_valueLG = sdf('CHISQUARE', LRTLG, 1);
run;

proc print data=CompareModels;
run;
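/*
* After comparing the models it is clear that LNormal is the best fit for our dataset.
* NOTE(review): with the log-likelihoods hard-coded in CompareModels, the
* lognormal-vs-gamma statistic (LRTLG) is about 132.18 on 1 df, which appears to
* favour the gamma fit over the lognormal one - confirm this conclusion.
*/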
/*
* Screening the covariates (using the TEST statement in PROC LIFETEST) to get an idea of
* which variables are significant and which are non-significant.
*/
/*
 * Life-table estimate of the YearsAtCompany survival function with interval
 * endpoints 5, 10, ..., 40, 41, plotting the survival (S) and hazard (H)
 * functions. AttritionN = 0 marks a censored observation.
 * The TEST statement lists the covariates to be tested for association with
 * the survival time.
 *
 * Two fixes: the doubled ";;" after PLOTS= is reduced to one, and the stray,
 * truncated "PROC CO" that followed RUN has been removed. It had no
 * terminating semicolon, so it absorbed the next PROC statement in the file
 * and made that step fail.
 */
proc lifetest data=project.ferma1
              method=life
              intervals=5 10 15 20 25 30 35 40 41
              plots=(S,H);
    time YearsAtCompany * AttritionN(0);
    test DailyRate DistanceFromHome
         EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel
         JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked
         PercentSalaryHike PerformanceRating RelationshipSatisfaction
         StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
         YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager Education Gender_num
         Mar_Status;
run;
/*
* Screening the covariates further using LIFEREG
*/
/*
 * Log-normal survival regression on the full covariate list.
 * Response is YearsAtCompany; AttritionN = 0 marks a censored observation.
 * Education is declared as a CLASS variable in this step.
 */
proc lifereg data=Project.Ferma1;
    class BusinessTravel Department Education EducationField JobRole OverTime;
    model YearsAtCompany*AttritionN(0) =
          Age
          DailyRate
          DistanceFromHome
          EmployeeCount
          EmployeeNumber
          EnvironmentSatisfaction
          HourlyRate
          JobInvolvement
          JobLevel
          JobSatisfaction
          MonthlyIncome
          MonthlyRate
          NumCompaniesWorked
          PercentSalaryHike
          PerformanceRating
          RelationshipSatisfaction
          StandardHours
          StockOptionLevel
          TotalWorkingYears
          TrainingTimesLastYear
          WorkLifeBalance
          YearsInCurrentRole
          YearsSinceLastPromotion
          YearsWithCurrManager
          mar_status
          gender_num
          / distribution=lnormal;
run;
/*
* Removing insignificant ones: DailyRate, EmployeeCount, EmployeeNumber, HourlyRate,
* MonthlyRate, PercentSalaryHike, PerformanceRating, RelationshipSatisfaction,
* StandardHours, StockOptionLevel, WorkLifeBalance, Gender, JobRole, Education,
* EducationField, JobLevel
* NOTE(review): WorkLifeBalance is still listed in the MODEL statements that follow, and
* StockOptionLevel is still listed in the Weibull one - confirm which is intended.
*/
/*
 * Log-normal survival regression on the reduced covariate list, with a
 * probability plot. Response is YearsAtCompany; AttritionN = 0 marks a
 * censored observation.
 * NOTE(review): CLF is kept exactly as written, but it does not appear to be
 * among the documented MODEL statement options for PROC LIFEREG - confirm it
 * is accepted, or remove it.
 * NOTE(review): bonus_total is not created in the visible script; presumably
 * it arrives with the merged Project.test columns - confirm.
 */
proc lifereg data=Project.Ferma1;
    class BusinessTravel Department OverTime;
    model YearsAtCompany*AttritionN(0) =
          Age
          DistanceFromHome
          EnvironmentSatisfaction
          JobInvolvement
          JobSatisfaction
          NumCompaniesWorked
          TotalWorkingYears
          MonthlyIncome
          TrainingTimesLastYear
          YearsInCurrentRole
          YearsSinceLastPromotion
          YearsWithCurrManager
          mar_status
          WorkLifeBalance
          BusinessTravel
          Department
          bonus_total
          OverTime
          / distribution=lnormal clf;
    probplot;
run;
/*
 * Weibull survival regression on the reduced covariate list (this variant
 * also includes StockOptionLevel), with a probability plot.
 * Response is YearsAtCompany; AttritionN = 0 marks a censored observation.
 */
proc lifereg data=Project.Ferma1;
    class BusinessTravel Department OverTime;
    model YearsAtCompany*AttritionN(0) =
          Age
          DistanceFromHome
          EnvironmentSatisfaction
          JobInvolvement
          JobSatisfaction
          NumCompaniesWorked
          TotalWorkingYears
          MonthlyIncome
          TrainingTimesLastYear
          YearsInCurrentRole
          YearsSinceLastPromotion
          YearsWithCurrManager
          mar_status
          StockOptionLevel
          WorkLifeBalance
          BusinessTravel
          Department
          bonus_total
          OverTime
          / distribution=weibull;
    probplot;
run;
/*
* Visualization
*/
/* Box plot of Age, one box per AttritionN value. */
proc sgplot data=project.ferma1;
    title ' Distribution of Attrition by Age';
    label Age = 'Age'
          AttritionN = 'Attrition';
    vbox Age / category=AttritionN;
run;

/* Bar chart of AttritionN counts, bars split by Overtime. */
proc sgplot data=Project.Ferma1;
    title ' Distribution of Attrition by Overtime';
    label Overtime = 'Overtime'
          AttritionN = 'Attrition';
    vbar AttritionN / group=Overtime;
run;
/*
* Attrition Comparison using Strata
*/
/*
 * Life-table survival estimates of YearsAtCompany (AttritionN = 0 is
 * censored), stratified by Overtime crossed with a second variable.
 * ADJUST=TUKEY is requested on each STRATA statement, and the survival (S)
 * and hazard (H) plots are drawn.
 */

/* Overtime x BusinessTravel */
proc lifetest data=PROJECT.FERMA1 method=life plots=(S,H);
    time YearsAtCompany*AttritionN(0);
    strata Overtime businessTravel / adjust=tukey;
run;

/* Overtime x MaritalStatus */
proc lifetest data=PROJECT.FERMA1 method=life plots=(S,H);
    time YearsAtCompany*AttritionN(0);
    strata Overtime MaritalStatus / adjust=tukey;
run;

/* Overtime x JobSatisfaction */
proc lifetest data=PROJECT.FERMA1 method=life plots=(S,H);
    time YearsAtCompany*AttritionN(0);
    strata Overtime JobSatisfaction / adjust=tukey;
run;
/*
 * Life-table survival estimates of YearsAtCompany (AttritionN = 0 is
 * censored), stratified by one variable at a time, with survival (S) and
 * hazard (H) plots.
 *
 * The step was previously copy-pasted once per variable, and the Department
 * run appeared twice verbatim. The shared step now lives in one macro and
 * each stratification variable is run exactly once, in the original order.
 *
 *   stratum - name of the variable placed on the STRATA statement
 */
%macro strata_lifetest(stratum);
    proc lifetest data=PROJECT.FERMA1 method=life plots=(S,H);
        time YearsAtCompany*AttritionN(0);
        strata &stratum;
    run;
%mend strata_lifetest;

%strata_lifetest(JobSatisfaction)
%strata_lifetest(WOrkLifeBalance)
%strata_lifetest(Joblevel)
%strata_lifetest(Department)
%strata_lifetest(Experience)
%strata_lifetest(Overtime)
%strata_lifetest(EnvironmentSatisfaction)
%strata_lifetest(bonus_total)
/*
 * Box plots of MonthlyIncome, one box per AttritionN value, grouped by a
 * second variable.
 * The second plot groups by Department but carried the "Income and Overtime"
 * title copied from the first; its title now names Department. The
 * "label Age" clause was dropped because Age is not plotted in either step.
 */
proc sgplot data=project.ferma1;
    vbox MonthlyIncome / category=AttritionN group=Overtime;
    label AttritionN = 'Attrition';
    title ' Distribution of Attrition by Income and Overtime';
run;

proc sgplot data=project.ferma1;
    vbox MonthlyIncome / category=AttritionN group=Department;
    label AttritionN = 'Attrition';
    title ' Distribution of Attrition by Income and Department';
run;
/*
 * Attrition x TrainingTimesLastYear cross-tabulations with grouped vertical
 * frequency plots, for four subsets of project.ferma1.
 * ODS Graphics is switched on once for all four steps.
 * No TITLE statement is issued here, so whichever title is currently in
 * effect is printed with the output.
 */
ods graphics on;

/* Employees with fewer than 5 years at the company. */
proc freq data=project.ferma1(where=(yearsatcompany <5));
    tables Attrition*Trainingtimeslastyear
           / plots=freqplot(twoway=groupvertical);
run;

/* 5 or more years, Research & Development. */
proc freq data=project.ferma1(where=(yearsatcompany >=5 and department ='Research & Development'));
    tables Attrition*Trainingtimeslastyear
           / plots=freqplot(twoway=groupvertical);
run;

/* 5 or more years, Sales. */
proc freq data=project.ferma1(where=(yearsatcompany >=5 and department ='Sales'));
    tables Attrition*Trainingtimeslastyear
           / plots=freqplot(twoway=groupvertical);
run;

/* 5 or more years, Human Resources. */
proc freq data=project.ferma1(where=(yearsatcompany >=5 and department ='Human Resources'));
    tables Attrition*Trainingtimeslastyear
           / plots=freqplot(twoway=groupvertical);
run;
/*
 * Box plots of three further measures, one box per AttritionN value.
 * All three previously reused the title "Distribution of Attrition by Income
 * and Overtime" although none of them plots income; each title now names the
 * variable actually drawn. The "label Age" clause was dropped because Age is
 * not plotted in any of these steps.
 */

/* DistanceFromHome, grouped by Overtime. */
proc sgplot data=project.ferma1;
    vbox Distancefromhome / category=AttritionN group=Overtime;
    label AttritionN = 'Attrition';
    title ' Distribution of Attrition by Distance From Home and Overtime';
run;

/* TotalWorkingYears. */
proc sgplot data=project.ferma1;
    vbox TotalworkingYears / category=AttritionN;
    label AttritionN = 'Attrition';
    title ' Distribution of Attrition by Total Working Years';
run;

/* Bonus_total. */
proc sgplot data=project.ferma1;
    vbox Bonus_total / category=AttritionN;
    label AttritionN = 'Attrition';
    title ' Distribution of Attrition by Bonus Total';
run;
/*
* Transpose Bonus variables
*/
/*
 * Reshape bonus_1 - bonus_40 to long form: one row per employeenumber and
 * bonus column, with the column name in Type and its value in bonus.
 */
proc transpose data=project.ferma1
               out=project.Bonus(rename=(_name_=Type col1=bonus));
    by employeenumber;
    var bonus_1 - bonus_40;
run;

/*
 * Flag each long-form row: bonus_updated is 0 when bonus is 'NA' or '0',
 * and 1 for every other value.
 * NOTE(review): a blank bonus value is also flagged 1 - confirm intended.
 */
data project.formatBonus;
    set project.Bonus;
    format bonus_updated;
    bonus_updated = not (Bonus = 'NA' or bonus = '0');
run;

/*
 * Frequency tables of the flagged rows (bonus_updated = 1), one BY group per
 * employeenumber. No TABLES statement is given, so PROC FREQ tabulates every
 * remaining variable.
 */
proc freq data=project.formatBonus(where=(bonus_updated=1));
    by employeenumber;
run;