-
Notifications
You must be signed in to change notification settings - Fork 0
/
Part2_SASCode.sas
230 lines (201 loc) · 7.18 KB
/
Part2_SASCode.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
/*Project 1: Forecasting Earnings*/
* approximately, it will take 35 seconds to run the whole code;
/*Part 1: Data Preparation*/
*Step 1: import earnings dataset from desktop folder;
LIBNAME ASGMT1 'C:\Users\15758\Desktop\ACX3300\A1';
DATA EARNINGS; SET ASGMT1.EARNINGS; RUN;
*Step 2: check the variables in dataset;
PROC CONTENTS DATA=EARNINGS; RUN;
*Step 3: select a sample of firms whose GVKEY starts with '5';
DATA FIRMS; SET EARNINGS;
IF FIRST(LEFT(GVKEY))='5' THEN OUTPUT FIRMS; RUN;
*Step 4: construct variables;
DATA ONE; SET FIRMS (KEEP= GVKEY FYEAR DATADATE AT CEQ CHE CSHO DLC DLTT INVT IVST MIB NI OANCF OIADP PI PSTK RECT SALE TXT XSGA);
EPS=NI/CSHO;
NOA=SUM(DLTT, DLC, CEQ, PSTK, MIB, -CHE, -IVST);
PM=OIADP/SALE;
ATO=SALE/NOA;
TACC=NI-OANCF;
INV=INVT/NOA;
AR=RECT/NOA;
ETR=TXT/PI;
SGX=XSGA/SALE;
* delete the observation with missing values;
ARRAY M(*) EPS NOA PM ATO TACC AR ETR SGX;
DO I =1 TO HBOUND(M);
IF M(I)=. THEN DELETE;
END;
DROP I;
RUN;
*Step 5: create related accounting variables;
PROC SQL;
CREATE TABLE TWO AS SELECT DISTINCT A.*, B.NI/A.AT AS ROA1, A.NI/C.AT AS ROA, A.NI/C.CEQ AS ROE, A.AT/A.CEQ AS LEV
FROM ONE AS A, ONE AS B, ONE AS C
WHERE A.GVKEY=B.GVKEY=C.GVKEY AND A.FYEAR=(B.FYEAR-1)=(C.FYEAR+1);
QUIT;
*Step 6: using log of variables;
DATA THREE; SET TWO;
LOGROA=LOG(ROA);
LOGROA1=LOG(ROA1);
LOGPM=LOG(PM);
LOGATO=LOG(ATO);
LOGTACC=LOG(TACC);
LOGINV=LOG(INV);
LOGAR=LOG(AR);
LOGETR=LOG(ETR);
LOGSGX=LOG(SGX);
RUN;
*Step 7: use SAS MACRO to winsorize the continuous variables each year at top and bottom 1%;
%MACRO WIN (DATA, SORT, N, VAR);
PROC SORT DATA=&DATA; BY &SORT;
PROC MEANS DATA=&DATA NOPRINT; BY &SORT;
VAR &VAR;
OUTPUT OUT=STAT (DROP=_TYPE_ _FREQ_)
P1=L1-L&N
P99=H1-H&N ;
RUN;
DATA NEW&DATA; MERGE &DATA STAT; BY &SORT;
ARRAY V(*) &VAR;
ARRAY H(*) H1-H&N;
ARRAY L(*) L1-L&N;
DO I=1 TO &N;
IF .<V(I)<L(I) THEN V(I)=L(I);
IF V(I)>H(I) THEN V(I)=H(I);
END;
DROP I H1-H&N L1-L&N;
PROC MEANS DATA=NEW&DATA;
VAR &VAR;
RUN;
%MEND;
* windorization for the sample data;
DATA SAMPLE; SET THREE; RUN;
%WIN (DATA=SAMPLE, SORT=FYEAR, N=22, VAR=EPS NOA PM ATO TACC INV AR ETR SGX ROA1 ROA ROE LEV LOGROA LOGROA1 LOGPM LOGATO LOGTACC LOGINV LOGAR LOGETR LOGSGX);
RUN;
/*Part 2: Descriptive Statistics and Correlation Coefficients*/
*Step 1: descriptive statistics of the variables;
* statistics for EPS;
PROC UNIVARIATE DATA=NEWSAMPLE TRIMMED=0.01;
VAR EPS;
TITLE 'STATISTICS - EPS';
RUN;
* statistics for ROA1;
PROC UNIVARIATE DATA=NEWSAMPLE TRIMMED=0.01;
VAR ROA1;
TITLE 'STATISTICS - ROA1';
RUN;
*Step 2: correlation coefficients between the variables;
PROC CORR DATA=NEWSAMPLE;
VAR EPS ROA1 ROA NOA PM ATO TACC INV AR ETR SGX ROE LEV;
WITH EPS ROA1 ROA;
TITLE 'PEARSON CORRELATION';
RUN;
PROC CORR DATA=NEWSAMPLE SPEARMAN;
VAR EPS ROA1 ROA NOA PM ATO TACC INV AR ETR SGX ROE LEV;
WITH EPS ROA1 ROA;
TITLE 'SPEARMAN CORRELATION';
RUN;
PROC CORR DATA=NEWSAMPLE;
VAR LOGROA1 LOGROA LOGPM LOGATO LOGTACC LOGINV LOGAR LOGETR LOGSGX;
WITH LOGROA1;
TITLE 'PEARSON CORRELATION - LOG';
RUN;
PROC CORR DATA=NEWSAMPLE SPEARMAN;
VAR LOGROA1 LOGROA LOGPM LOGATO LOGTACC LOGINV LOGAR LOGETR LOGSGX;
WITH LOGROA1;
TITLE 'SPEARMAN CORRELATION - LOG';
RUN;
/*Part 3: Simple Models*/
*Step 1: random walk model (1, 2 and 3-year);
* moving average model (2, 3 and 5 years);
* errors forecasting;
PROC SQL;
CREATE TABLE SIMPLE
AS SELECT DISTINCT A.GVKEY, A.FYEAR, A.EPS,
B.EPS AS RW1, ABS((A.EPS-B.EPS)/A.EPS) AS ERROR_RW1,
C.EPS AS RW2, (B.EPS+C.EPS)/2 AS MV2, ABS((A.EPS-C.EPS)/A.EPS) AS ERROR_RW2, ABS((A.EPS-(B.EPS+C.EPS)/2)/A.EPS) AS ERROR_MV2,
D.EPS AS RW3, (B.EPS+C.EPS+D.EPS)/3 AS MV3, ABS((A.EPS-D.EPS)/A.EPS) AS ERROR_RW3, ABS((A.EPS-(B.EPS+C.EPS+D.EPS)/3)/A.EPS) AS ERROR_MV3,
E.EPS AS RW4, (B.EPS+C.EPS+D.EPS+E.EPS)/4 AS MV4, ABS((A.EPS-E.EPS)/A.EPS) AS ERROR_RW4, ABS((A.EPS-(B.EPS+C.EPS+D.EPS+E.EPS)/4)/A.EPS) AS ERROR_MV4,
F.EPS AS RW5, (B.EPS+C.EPS+D.EPS+E.EPS+F.EPS)/5 AS MV5, ABS((A.EPS-F.EPS)/A.EPS) AS ERROR_RW5, ABS((A.EPS-(B.EPS+C.EPS+D.EPS+E.EPS+F.EPS)/5)/A.EPS) AS ERROR_MV5
FROM NEWSAMPLE AS A, NEWSAMPLE AS B, NEWSAMPLE AS C, NEWSAMPLE AS D, NEWSAMPLE AS E, NEWSAMPLE AS F
WHERE A.GVKEY=B.GVKEY=C.GVKEY=D.GVKEY=E.GVKEY=F.GVKEY AND A.FYEAR=B.FYEAR+1=C.FYEAR+2=D.FYEAR+3=E.FYEAR+4=F.FYEAR+5;
QUIT;
PROC PRINT DATA=SIMPLE (OBS=20); TITLE 'SIMPLE MODEL'; RUN;
*Step 2: comparing forecast errors;
PROC MEANS DATA=SIMPLE N MEAN STD MIN P1 P5 P10 Q1 MEDIAN Q3 P90 P95 P99 MAX MAXDEC=2;
VAR ERROR_RW1 ERROR_RW2 ERROR_MV2 ERROR_RW3 ERROR_MV3 ERROR_RW5 ERROR_MV5;
RUN;
*Step 3: histogram distribution of forecast errors by using SAS MACRO;
%MACRO UNI(DATA=, VAR=);
PROC UNIVARIATE DATA=&DATA;
WHERE -10 <= &VAR <=10;
VAR &VAR;
HISTOGRAM &VAR/MIDPOINTS=0 TO 1 BY 1;
RUN;
%MEND;
*random walk model (1, 2 and 3-year) and moving average model (2, 3 and 5 years);
%UNI (DATA=SIMPLE, VAR=ERROR_RW1); RUN;
%UNI (DATA=SIMPLE, VAR=ERROR_RW2); RUN;
%UNI (DATA=SIMPLE, VAR=ERROR_RW3); RUN;
%UNI (DATA=SIMPLE, VAR=ERROR_MV2); RUN;
%UNI (DATA=SIMPLE, VAR=ERROR_MV3); RUN;
%UNI (DATA=SIMPLE, VAR=ERROR_MV5); RUN;
PROC TTEST DATA=SIMPLE H0=0; VAR ERROR_RW1; RUN;
PROC TTEST DATA=SIMPLE H0=0; VAR ERROR_RW2; RUN;
PROC TTEST DATA=SIMPLE H0=0; VAR ERROR_RW3; RUN;
PROC TTEST DATA=SIMPLE H0=0; VAR ERROR_MV2; RUN;
PROC TTEST DATA=SIMPLE H0=0; VAR ERROR_MV3; RUN;
PROC TTEST DATA=SIMPLE H0=0; VAR ERROR_MV5; RUN;
/*Part 4: Complex Models*/
*Step 1: forecast ROA by using OLS regressions;
PROC REG DATA=NEWSAMPLE OUTEST=COMPLEX1;
MODEL ROA1=ROA PM ATO TACC INV AR ETR SGX;
TITLE 'COMPLEX1';
RUN;
*Step 2: stepwise regressions;
PROC REG DATA=NEWSAMPLE OUTEST=COMPLEX2;
MODEL ROA1=ROA PM ATO TACC INV AR ETR SGX / SELECTION=STEPWISE;
TITLE 'COMPLEX2';
RUN;
*Step 3: using only the five variables selected by stepwise;
PROC REG DATA=NEWSAMPLE OUTEST=COMPLEX3;
MODEL ROA1=ROA PM INV AR TACC;
TITLE 'COMPLEX3';
RUN;
*Step 4: log of variables;
PROC REG DATA=NEWSAMPLE OUTEST=COMPLEX4;
MODEL LOGROA1=LOGROA LOGPM LOGATO LOGTACC LOGINV LOGAR LOGETR LOGSGX;
TITLE 'COMPLEX4';
RUN;
/*Part 5: Model Backtesting*/
*Step 1: partition sample into estimation sample and test sample;
%MACRO BACKTEST(N=);
%DO K = 1 %TO &N;
DATA EST TEST; SET NEWSAMPLE;
IF RANUNI(&K)<0.5 THEN OUTPUT EST;
ELSE OUTPUT TEST;
RUN;
*Step 2: estimate the model using the estimation sample;
PROC REG DATA=EST OUTEST=EST1 NOPRINT;
MODEL ROA1=ROA PM ATO TACC INV AR ETR SGX;
RUN;
*Step 3: construct forecast of coefficients and errors for the test sample;
DATA EST2; SET EST1;
RENAME ROA=B_ROA PM=B_PM ATO=B_ATO TACC=B_TACC INV=B_INV AR=B_AR ETR=B_ETR SGX=B_SGX;
DATA TEST1; SET TEST;
IF _N_=1 THEN SET EST2;
P_ROA1=INTERCEPT + B_ROA*ROA + B_PM*PM + B_ATO*ATO + B_TACC*TACC + B_INV*INV + B_AR*AR + B_ETR*ETR + B_SGX*SGX ;
ERROR=ABS(ROA1-P_ROA1);
RUN;
PROC MEANS DATA= TEST1 NOPRINT;
VAR ROA1 ERROR;
OUTPUT OUT=BACK1 MEAN=M_ROA1 M_ERROR MEDIAN=MD_ROA1 MD_ERROR;
RUN;
PROC APPEND BASE=AVERAGE DATA=BACK1; RUN;
%END;
PROC MEANS DATA=AVERAGE N MEAN MEDIAN ;
VAR M_ROA1 M_ERROR MD_ROA1 MD_ERROR;
TITLE 'BACKTESTING';
RUN;
%MEND;
*Step 4: use SAS MACRO to repeat 100 times;
%BACKTEST(N=100); RUN;