-
Notifications
You must be signed in to change notification settings - Fork 0
/
ADC.R
130 lines (116 loc) · 5.25 KB
/
ADC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#First we are going to import both datasets
setwd("/Users/berna/Desktop/ADC treball/")
df_happiness <- read.table("/Users/berna/Desktop/ADC treball/world-happiness-report.csv",header = TRUE, sep = ",")
df_P <- read.table("/Users/berna/Desktop/ADC treball/openaq.csv",header = TRUE, sep = ";",fill=TRUE,stringsAsFactors = FALSE)
scalar1 <- function(x) {x / sqrt(sum(x^2))}
#Get the year by date of openaq dataset
df_P$Last_Updated <- substr(df_P$Last_Updated, 1,4)
#Get all data of CO pollution
to_purge=which(df_P$Pollutant == "CO")
df_Pollution=df_P[c(to_purge),]
#Set all values to same unit (µg/m^3)
for(row in 1:nrow(df_Pollution)){
if(df_Pollution$Unit[row] == "ppm"){
df_Pollution$Value[row]= as.character(as.numeric(as.character(df_Pollution$Value[row]))*1145.609)
df_Pollution$Unit[row] = "µg/m³"}
}
#Get the concurrent countries with data
Countries <- unique(df_happiness$ï..country)
Countries2 <- unique(df_Pollution$Country_Label)
Countries_in_Common=Reduce(intersect, list(Countries,Countries2))
#Get data from all concurrent countries
Life_Ladder=c()
CO=c()
for(c in Countries_in_Common){
#For each country, we find what years we have concurrences of both datasets
index_countryH=which(df_happiness$ï..country == c)
index_country1=which(df_Pollution$Country_Label == c)
index_country2=which(df_Pollution$Pollutant == "CO")
index_countryP=Reduce(intersect, list(index_country1,index_country2))
#We reduce the dataset to country and CO
df_per_countryH=df_happiness[c(index_countryH),]
df_per_countryP=df_Pollution[c(index_countryP),]
#We calculate the common years
common_years=Reduce(intersect, list(unique(df_per_countryP[ , 9]),unique(df_per_countryH[ , 2])))
for(year in common_years){
#For each country and year pair, we will find the mean of CO pollution and pair it with the Life Ladder
group1=which(df_per_countryH$year == year)
group2=which(df_per_countryP$Last_Updated == year)
mean_CO=sum(as.numeric(as.vector(df_per_countryP[group2,]$Value)))/length(df_per_countryP[group2,]$Value)
#We will filter out negative pollution values
if(mean_CO>0){
CO=c(CO,mean_CO)
Life_Ladder=c(Life_Ladder,df_per_countryH[group1,]$Life_Ladder)
}
}
}
log_CO=c(log(CO))
plot(Life_Ladder,log_CO,main="The relationship between CO in air and the Life Ladder",xlab="Life Ladder score", ylab="Log CO in the air (µg/m^3)")
data<-cbind(Life_Ladder,log_CO)
fit<-lm(data[,2]~data[,1])
summary(fit)
abline(summary(fit)$coefficients[1], summary(fit)$coefficients[2],col="red")
#We are going to make a permutation test on the F-statistic to ensure the pvalue is under 0.05
b<-anova(fit)
Ftrue<-b$"F value"[1];
nr=10000 #number of rearrangements to be examined
F=numeric(nr);
for (i in 1:nr){
newx<- sample(data, 60)
fit<-lm(newx ~ data[,1])
b<-anova(fit)
F[i]<-b$"F value"[1]}
experimental_pvalue=length(F[F >= Ftrue])/nr
#The current ranking of countries (2020)
index_LL=which(df_happiness$year == "2020")
df_LL_2021=df_happiness[c(index_LL),]
df_2=subset(df_LL_2021, select=c("ï..country", "Life_Ladder"))
x=df_2[order(df_2$Life_Ladder),]
x[order(nrow(x):1),]
#Get CI for CO to get to top 10
x=Life_Ladder
y=log_CO
data=data.frame(Life_Ladder,log_CO)
n<-length(x); data<-cbind(x,y)
theta<-summary(fit1)$coefficients[1]
sdtheta<-summary(fit1)$coefficients[3]
nb<-1000; z<-seq(1,n);tb<-numeric(nb);predb<-numeric(nb)
thetab<-numeric(nb)
for(i in 1:nb){
zb<-sample(z,n,replace=T)
ajustb<- lm(data[zb,2]~ data[zb,1])
thetab[i]<-summary(ajustb)$coefficients[1]
sdthetab<-summary(ajustb)$coefficients[3]
predb[i]<-summary(ajustb)$coefficients[1]+summary(ajustb)$coefficients[2]*7.213
tb[i]<-(thetab[i]-theta)/sdthetab}
hist(predb,main="Histogram of the CO needed to get in the top 10")
Prediction_CO_to_top10Score=quantile(predb,c(0.025,0.975))
#Get the data from spain
CO_s=c()
Life_Ladder_s=c()
index_countryH=which(df_happiness$ï..country == "Spain")
index_country1=which(df_Pollution$Country_Label == "Spain")
index_country2=which(df_Pollution$Pollutant == "CO")
index_countryP=Reduce(intersect, list(index_country1,index_country2))
#We reduce the dataset to Spain and CO
df_per_countryH=df_happiness[c(index_countryH),]
df_per_countryP=df_Pollution[c(index_countryP),]
#We calculate the common years
common_years=Reduce(intersect, list(unique(df_per_countryP[ , 9]),unique(df_per_countryH[ , 2])))
for(year in common_years){
#For each year, we will find the mean of CO pollution and pair it with the Life Ladder
group1=which(df_per_countryH$year == year)
group2=which(df_per_countryP$Last_Updated == year)
mean_CO=sum(as.numeric(as.vector(df_per_countryP[group2,]$Value)))/length(df_per_countryP[group2,]$Value)
#We will filter out negative pollution values
if(mean_CO>0){
CO_s=c(CO_s,mean_CO)
Life_Ladder_s=c(Life_Ladder_s,df_per_countryH[group1,]$Life_Ladder)
}
}
#Log regression fit
log_CO_s=c(log(CO_s))
plot(Life_Ladder_s,log_CO_s,main="The relationship between CO in air and the Life Ladder of Spain",xlab="Life Ladder score", ylab="Log CO in the air(µg/m^3)")
data_s<-cbind(Life_Ladder_s,log_CO_s)
fit_S<-lm(data_s[,2]~data_s[,1])
abline(summary(fit_S)$coefficients[1], summary(fit_S)$coefficients[2],col="red")