-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHW5_MKrishnamoorthy.sas
151 lines (130 loc) · 4.57 KB
/
HW5_MKrishnamoorthy.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
* HW 5 SAS Code: Maya Krishnamoorthy;
/*
   Import step: assign the course library and read the tab-delimited
   text file into the permanent dataset b645.pima. Reading starts at
   the second line of the file (firstobs=2); '09'x is the tab character.
*/
libname b645 "/home/u59228083/BIOS645";

data b645.pima;
    infile "/home/u59228083/BIOS645/Pima_fasting_glucose.txt"
           firstobs=2
           delimiter='09'x;
    /* six numeric columns, read with list input in file order */
    input glucose
          pregnancies
          dia_bp
          skin_fold
          bmi
          age;
run;
/*
   Exploratory bivariate plots: glucose against each of the other five
   variables. Every plot overlays the raw points, a fitted regression
   line (REG) and a loess smooth (LOESS); both fits are drawn without
   their own markers so the points are not plotted twice.
*/
%macro glucose_scatter(xvar=, heading=);
    /* xvar    = variable placed on the X axis                      */
    /* heading = text inserted into the plot title for that variable */
    proc sgplot data=b645.pima;
        title1 "Scatter Plot: &heading vs. plasma glucose concentration";
        scatter x=&xvar y=glucose;
        reg     x=&xvar y=glucose / nomarkers;
        loess   x=&xvar y=glucose / nomarkers;
    run;
%mend glucose_scatter;

%glucose_scatter(xvar=pregnancies, heading=Pregnancies)
%glucose_scatter(xvar=dia_bp,      heading=Diastolic bp)
%glucose_scatter(xvar=skin_fold,   heading=thickness of skin folds)
%glucose_scatter(xvar=bmi,         heading=BMI)
%glucose_scatter(xvar=age,         heading=age)
/*proc corr data=b645.pima plots=matrix(histogram);
var glucose pregnancies dia_bp skin_fold bmi age;
run;*/
/*
   Univariate summaries: one row of descriptive statistics per variable,
   reported to three decimal places.
*/
proc means data=b645.pima maxdec=3
           n min median max mean std skew kurt;
    title1 "Distribution of Variables in Pima Dataset";
    var glucose
        pregnancies dia_bp skin_fold bmi age;
run;
/*
   First attempt: multiple regression of glucose concentration on all
   five predictors. PLOTS=(RESIDUALS(SMOOTH)) requests the residual
   panels with a smoother overlaid.
*/
proc reg data=b645.pima plots=(RESIDUALS(SMOOTH));
    title1 "Multiple Regression of Glucose Concentration Levels";
    model glucose = pregnancies dia_bp skin_fold bmi age;
run;
/* PROC REG is interactive and stays active after RUN; end it explicitly,
   as the other PROC REG steps in this program do. */
quit;
/*
   The residual plots show outliers, so fit the same model again with
   printed output suppressed (NOPRINT) and write the input data plus a
   residual column named resids to the dataset pima_resids.
*/
proc reg noprint data=b645.pima;
    model glucose = pregnancies dia_bp skin_fold bmi age;
    output out=pima_resids
           residual=resids;
run;
quit;
/* Distribution of the saved residuals, with a separate, larger Q-Q plot */
proc univariate data=pima_resids;
    var resids;
    qqplot resids;
run;
/*
   Collect the rows flagged as residual outliers in pima_resids2 and
   list them. Rows are identified by their position in pima_resids.
*/
data pima_resids2;
    set pima_resids;
    id = _n_;                         /* row number within pima_resids */
    if id in (8, 101, 179, 491);      /* subsetting IF: all other rows are discarded */
run;

proc print data=pima_resids2;
run;
/*
   Build a copy of the flagged rows under different variable names so
   they can be drawn as their own series: glucose becomes 'outliers'
   and each predictor gets an 'n' prefix. resids and id are not carried
   over.
*/
data pima_resids3;
    set pima_resids2 (drop   = resids id
                      rename = (glucose     = outliers
                                pregnancies = npregnancies
                                dia_bp      = ndia_bp
                                skin_fold   = nskin_fold
                                bmi         = nbmi
                                age         = nage));
run;

/*
   Stack the full residual dataset on top of the renamed outlier rows.
   The appended rows have values only in the renamed columns, so they
   add points to the highlight series and nothing to the main series.
*/
data pima_resids4;
    set pima_resids
        pima_resids3;
run;
/*
   Re-plot glucose against each predictor with the potential outliers
   highlighted: the first SCATTER and the REG line use the original
   columns, the second SCATTER draws the renamed outlier columns as
   large filled circles.
*/
%macro flag_outliers(xvar=);
    /* xvar = predictor on the X axis; its outlier copy is named n<xvar> */
    proc sgplot data=pima_resids4;
        scatter x=&xvar  y=glucose;
        reg     x=&xvar  y=glucose / nomarkers;
        scatter x=n&xvar y=outliers / markerattrs=(symbol=CircleFilled size=10);
    run;
%mend flag_outliers;

%flag_outliers(xvar=pregnancies)
%flag_outliers(xvar=dia_bp)
%flag_outliers(xvar=skin_fold)
%flag_outliers(xvar=bmi)
%flag_outliers(xvar=age)
/*
   Sensitivity analysis: refit the full model on the rows whose residual
   from the first fit lies in [-70, 77], and add confidence limits for
   the coefficients (CLB).
   NOTE(review): the cutoffs are presumably chosen so that exactly the
   four flagged rows fall outside the range - confirm against the
   printed residuals.
*/
proc reg data=pima_resids plots=(RESIDUALS(SMOOTH));
    model glucose = pregnancies dia_bp skin_fold bmi age / clb;
    where (resids >= -70) and (resids <= 77);
run;
quit;