-
Notifications
You must be signed in to change notification settings - Fork 5
/
6_Presentation_K_final-1.rmd
410 lines (305 loc) · 17.8 KB
/
6_Presentation_K_final-1.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
---
title: "Advertising Response Measurement"
author: "Team K"
date: "4/25/2019"
output:
slidy_presentation:
font_adjustment: -2
---
```{r setup, include=FALSE}
library(knitr)
opts_chunk$set(echo = TRUE, eval = TRUE, comment="", warning = FALSE, message = FALSE, tidy.opts=list(width.cutoff=55), tidy = TRUE)
set.seed(123) # ensures repeatable results for attribution rules
options(scipen=999) # suppress scientific notation
```
```{r libraries, echo=FALSE}
library(prettydoc)
library(data.table)
library(Hmisc)
library(scales)
library(DT)
library(lubridate)
```
```{r my_functions, echo=FALSE}
fit.model <- function(dt, outcome.name, input.names, model.type, digits = 3){
the.formula <- reduce.formula(dt = dt, outcome.name = outcome.name, input.names = input.names)
if(model.type == "logistic"){
mod <- glm(formula = the.formula, family = "binomial", data = dt)
mod.summary <- logistic.regression.summary(glm.mod = mod, digits = digits)
}
if(model.type == "linear"){
mod <- lm(formula = the.formula, data = dt)
mod.summary <- linear.regression.summary(lm.mod = mod, digits = digits)
}
mod.summary.rounded <- mod.summary[, lapply(X = .SD, FUN = "round.numerics", digits = digits)]
return(mod.summary.rounded)
}
logistic.regression.summary <- function(glm.mod, digits = 3){
library(data.table)
glm.coefs <- as.data.table(summary(glm.mod)$coefficients, keep.rownames = TRUE)
alpha = 0.05
z <- qnorm(p = 1-alpha/2, mean = 0, sd = 1)
glm.coefs[, Odds.Ratio := exp(Estimate)]
glm.coefs[, OR.Lower.95 := exp(Estimate - z * `Std. Error`)]
glm.coefs[, OR.Upper.95 := exp(Estimate + z * `Std. Error`)]
return(glm.coefs[])
}
linear.regression.summary <- function(lm.mod, digits = 3, alpha = 0.05) {
lm.coefs <- as.data.table(summary(lm.mod)$coefficients,keep.rownames = TRUE)
setnames(x = lm.coefs, old = "rn", new = "Variable")
z <- qnorm(p = 1 - alpha/2, mean = 0, sd = 1)
lm.coefs[, Coef.Lower.95 := Estimate - z * `Std. Error`]
lm.coefs[, Coef.Upper.95 := Estimate + z * `Std. Error`]
return(lm.coefs)
}
create.formula <- function(outcome.name, input.names, input.patterns = NA, all.data.names = NA, return.as = "character"){
variable.names.from.patterns <- c()
if(!is.na(input.patterns[1]) & !is.na(all.data.names[1])){
pattern <- paste(input.patterns, collapse = "|")
variable.names.from.patterns <- all.data.names[grep(pattern = pattern, x = all.data.names)]
}
all.input.names <- unique(c(input.names, variable.names.from.patterns))
all.input.names <- all.input.names[all.input.names != outcome.name]
if(!is.na(all.data.names[1])){
all.input.names <- all.input.names[all.input.names %in% all.data.names]
}
input.names.delineated <- sprintf("`%s`", all.input.names)
the.formula <- sprintf("`%s` ~ %s", outcome.name, paste(input.names.delineated, collapse = "+"))
if(return.as == "formula"){
return(as.formula(the.formula))
}
if(return.as != "formula"){
return(the.formula)
}
}
reduce.formula <- function(dt, outcome.name, input.names, input.patterns = NA, max.input.categories = 20, max.outcome.categories.to.search = 4, return.as = "formula"){
require(data.table)
dt <- setDT(dt)
if(!(outcome.name %in% names(dt))){
return("Error: outcome.name is not in names(dt).")
}
pattern.names <- list()
if(!is.na(input.patterns[1])){
for(i in 1:length(input.patterns)){
pattern.names[[i]] <- names(dt)[grep(pattern = input.patterns[i], x = names(dt))]
}
}
all.input.names <- c(input.names, as.character(pattern.names))
num.outcome.categories <- dt[!is.na(get(outcome.name)), length(unique(get(outcome.name)))]
if(num.outcome.categories <= max.outcome.categories.to.search){
num.unique.tab <- dt[, lapply(X = .SD, FUN = function(x){return(length(unique(x[!is.na(x)])))}), .SDcols = input.names, by = outcome.name]
min.categories.tab <- num.unique.tab[, lapply(X = .SD, FUN = "min"), .SDcols = input.names]
reduced.inputs <- names(min.categories.tab)[min.categories.tab >= 2]
}
if(num.outcome.categories > max.outcome.categories.to.search){
reduced.inputs <- all.input.names
}
the.formula <- create.formula(outcome.name = outcome.name, input.names = reduced.inputs, all.data.names = names(dt), input.patterns = NA, return.as = return.as)
return(the.formula)
}
round.numerics <- function(x, digits = 0, nearest = 1){
if(is.numeric(x)){
return(nearest * round(x = x/nearest, digits = digits))
}
else{
return(x)
}
}
```
```{r read_data_intro, echo=FALSE, eval=TRUE, results='hide'}
customer <- fread(input = "../Data/customer.csv")
impressions <- fread(input = "../Data/impressions.csv")
transactions <- fread(input = "../Data/transactions.csv")
num.lines <- 20
```
```{r preprocessing_customer, echo=FALSE, comment=""}
customer[,past.purchase := as.factor(past.purchase)]
customer[,email := as.factor(email)]
customer[,direct := as.factor(direct)]
```
```{r preprocessing_impressions, echo=FALSE, comment=""}
impressions[,date := as.Date(date)]
impressions[,channel := as.factor(channel)]
impressions[,click := as.factor(click)]
```
```{r preprocessing_transactions, echo=FALSE, comment=""}
transactions[,V1 := NULL]
transactions[,date := as.Date(date)]
transactions[,last.touch := as.factor(last.touch)]
transactions[,last.click := as.factor(last.click)]
```
## Introduction
- <font size="5">The goal of any marketing campaign is to increase sales, either in short-term or long-term, and each campaign or marketing channel should be evaluated based on the incremental profit, which is the additional sales we produce with advertising over what we would have sold without advertising, relative to its cost.</font>
- <font size="5">With the rapid development of digital media, we are able to track individual users’ behaviors, and it is beneficial for us to estimate **incremental sales** due to advertising in many different channels.</font>
- <font size="5">By using last-click attribution, experiments (holdout testing), marketing mix models and model-based attribution analysis, the main goal of this project is to **evaluate the effectiveness of different advertising channels**.</font>
## Description of the Dataset
- The whole raw digital advertising dataset describes 10,000 customers as well as potential customers of a retailer, and the retailer uses four different advertising channels - display ads, social media ads, email ads, and direct mail ads. This dataset is a synthetic one simulated by the Elea McDonnell Feit, Marketing Professor of Drexel University, and is organized from three perspectives: customer, impressions, and transactions:
- <span style="color:dodgerblue">**customer.csv**</span>: each row in the file represents a customer, 10,000 rows
```{r customer}
datatable(data = customer[1:num.lines,])
```
## Description of the Dataset
- <span style="color:dodgerblue">**Impressions.csv**</span>: each row is an exposure of marketing communication to a customer, 501,336 rows
```{r impressions}
datatable(data = impressions[1:num.lines,])
```
## Description of the Dataset
- <span style="color:dodgerblue">**Transactions.csv**</span>: each row is a transaction made by a customer
```{r transactions}
datatable(data = transactions[1:num.lines,])
```
## Method 1: Last-touch analysis
- <span style="color:dodgerblue">**Analysis**</span>: Based on last-touch attribution to find the last ad the user clicked on prior to the conversion so that we can get the sales attributed to each channel
- <span style="color:dodgerblue">**Implementation A**</span>:
```{r attribution.rules.table, echo=FALSE, comment=""}
last.touch.tab <- xtabs(~last.touch, data = transactions)
last.touch.tab
```
```{r attribution.rules.barplot, echo=FALSE, comment=""}
tab <- transactions[,.N,by = last.touch]
barplot <- barplot(height = tab[,N], space=0.01, las = 1, main = "Last Touch Attribution", ylab = "Transactions", xlab = "channel", ylim = c(0, 1.2*max(tab[,N], na.rm = TRUE)), col = "dodgerblue")
text(barplot, par("usr")[3], labels = tab[,last.touch], srt = 45, adj = c(1.1,1.1), xpd = TRUE)
space_val = 0
text(x = -0.4 + 1:length(tab[,last.touch]) * (1+space_val), y = tab[,N], labels = tab[,N], pos = 3)
```
- <span style="color:dodgerblue">**Implementation B**</span>: Shiny reporting engine: last touch analysis result for subgroups of transactions.
- <span style="color:dodgerblue">**Results & Interpretation**</span>: majority of conversions happen after receiving the ads through email, social media, and diplay.
## Method 2: Holdout Test
- <span style="color:dodgerblue">**Analysis**</span>: Conduct an experiment called holdout testing by randomly selecting the customers for the control group to be not exposed to an ad. Here, we picked 2017-01-31 to analyze the result of the holdout test for the first 10 days.
- <span style="color:dodgerblue">**Implementation A**</span>:
```{r holdout_test ttable, echo=FALSE, comment=""}
test.date <- as.Date("2017-01-31")
test.id <- impressions[date == test.date & channel == "email",.(id = unique(id))]
test.id[,group := "test group"]
hold.out.id <- impressions[date == test.date & channel == "email.holdout",.(id = unique(id))]
hold.out.id[,group := "control group"]
total.tab <- rbind(test.id,hold.out.id)
total.tab[,group := as.factor(group)]
duration <- 10
trans.id <- transactions[date>=test.date & date<(test.date+duration),id]
total.tab[,consumed := id %in% trans.id]
ttable <- xtabs(~group + consumed, data = total.tab)
ttable
mosaicplot(~group + consumed, data = total.tab,color = TRUE,
main = paste("Holdout test on", test.date))
```
- <span style="color:dodgerblue">**Results & Interpretation**</span>: Seen from the table as well as the plot, it is obvious that the proportion of people made actual consumption in test group who have received the email ads is **higher** than that in control group who didn't receive any email ads.
## Method 2: Holdout Test
```{r holdout_test proptest, echo=FALSE, comment=""}
proptest <- prop.test(x = ttable[,"TRUE"], n = xtabs(~group, data = total.tab))
proptest
diff.conv <- c(diff = (proptest$estimate[2]-proptest$estimate[1]), ci = -proptest$conf.int)
diff.conv
```
- <span style="color:dodgerblue">**Results & Interpretation**</span>: the test group had a **45.89%** conversion rate in the first 10 days after the email was sent, while the hold out group had a **40.1%** conversion rate. The incremental increase in conversion rate is between **+2.60%** and **+8.98%** (95% confidence interval).
- <span style="color:dodgerblue">**Implementation B**</span>: Shiny reporting engine: the ad response over time for different email holdout test and for different duration
## Method 3: Marketing Mix Model (MMM)
```{r Data.Prepration, echo=FALSE}
# Data Preparation: summarize impressions and transactions by date
trans.by.day <- xtabs(~date, transactions)
data.chanel <- xtabs(~date+channel, impressions)
mdata <- data.frame(cbind(Sales=trans.by.day[1:57], data.chanel[2:58,]))
# day of week
mdata$dayofweek <- weekdays(as.Date(rownames(mdata)))
# ad stock variable
mdata$Email.ad.effect <- as.numeric(filter(x=mdata$email, filter=0.5, method="recursive"))
mdata$Display.ad.effect <- as.numeric(filter(x=mdata$display, filter=0.3, method="recursive"))
mdata$Direct.ad.effect <- as.numeric(filter(x=mdata$direct, filter=0.75, method="recursive"))
mdata$Social.ad.effect <- as.numeric(filter(x=mdata$social, filter=0.3, method="recursive"))
# Interaction
mdata$inter<-(mdata$Email.ad.effect)*(mdata$Social.ad.effect)
# Beautiful
colnames(mdata)<-c("Sales","Direct","Display","Email","Email Holdout","Social",
"Day of Week",
"Email ad effect","Display ad effect","Direct ad effect","Social ad effect",
"Interaction")
```
- <span style="color:dodgerblue">**Analysis**</span>: In order to investigate the correlations between total sales (trainsactions) and impression factors within a specific time period, we implemented linear regression model for marketing mix modeling.
Model 1: Basic regression
Model 2: Add in a day of week variable
Model 3: Taking the advertising effect into consideration
Model 4: Interactions between Email and Social factors
- <span style="color:dodgerblue">**Implementation:Let’s take a look at Model 3**</span>
Account for the decay of advertising: An ad had its biggest **advertising effect** just after it is shown to the users and then the effect wears over the time.
```{r, echo=FALSE}
par(mfrow=c(2,2))
plot(mdata$`Email ad effect`, type="l", xlab="Time", ylab="Email")
plot(mdata$`Display ad effect`, type="l", xlab="Time", ylab="Display")
plot(mdata$`Direct ad effect`, type="l", xlab="Time", ylab="Direct")
plot(mdata$`Social ad effect`, type="l", xlab="Time", ylab="Social")
```
## Method 3: Marketing Mix Model (MMM)
```{r}
model.3<-fit.model(dt=mdata[10:nrow(mdata),c(1,8:11)], "Sales",
input.names=c("Email ad effect","Display ad effect","Direct ad effect","Social ad effect"),
model.type="linear", digits = 3)
datatable(model.3)
```
- <span style="color:dodgerblue">**Results & Interpretation: **</span>
Positive effects for all forms of advertising.
**Email** and **Direct** appears to have similar influence on sales at about 50 additional sales.
All effects are statistically significant except for **Display**. **Display** still has a high standard error because there is not a big change after the adjustment.
## Method 4: Model-based Attribution Analysis
```{r data_preparation, echo=FALSE}
adatal <- as.data.frame(xtabs(~ id + date + channel, data=impressions), stringsAsFactors=FALSE)
adatal$id <- as.integer(adatal$id)
adatal$date <- as.Date(adatal$date)
adatal$channel <- as.factor(adatal$channel)
dimnames(adatal)[[2]][4] <- "impr"
# Add in observations for users with no impressions
pop <- unique(customer$id)
no.impress.ids <- pop[!(pop %in% unique(impressions$id))]
dates <- sort(unique(impressions$date))
channels <- unique(impressions$channel)
no.impress.obs <- data.frame(id=rep(no.impress.ids, each=length(dates)*length(channels)),
date=rep(rep(dates, each=length(channels)), length(no.impress.ids)),
channel=rep(channels, length(no.impress.ids)*length(dates)),
impr=rep(0, length(dates)*length(no.impress.ids)*length(channels)),
stringsAsFactors=FALSE)
no.impress.obs$channel <- as.factor(no.impress.obs$channel)
adatal <- rbind(adatal, no.impress.obs)
# Convert from long to wide format
adata <- reshape(adatal, direction="wide", v.names="impr", idvar=c("id", "date"),
timevar="channel", new.row.names=NULL)
# Add transactions
atrans <- as.data.frame(xtabs(~ id + date, data=transactions), stringsAsFactors=FALSE)
atrans$id <- as.integer(atrans$id)
atrans$date <- as.Date(atrans$date)
dimnames(atrans)[[2]][3] <- "trans"
adata <- merge(adata, atrans, by=c("id", "date"), all=TRUE)
adata$trans[is.na(adata$trans)] <- 0 # Fill in zeros for transactions
# Final tidy up
# Remove first and last days (which are incomplete)
adata <- adata[adata$date!="2016-12-31" & adata$date != "2017-02-28" & adata$date != "2017-02-27",]
# Add customer info from cust table
adata <- merge(adata, customer, by=c("id"))
# Tidy up column names
dimnames(adata)[[2]][3:11] <- c("direct", "display", "email", "email.holdout", "social", "trans", "past.purchase", "has.email", "has.direct")
rm(adatal, atrans)
```
- <span style="color:dodgerblue">**Analysis: **</span>Different from the MMM method, model-based attribution focuses on the **user level** by relating user transactions to prior advertising impressions. By visualizing this relationship as below, it is easy to see that users convert more on days they get emails or direct mails.
```{r plots, echo=FALSE}
par(mfrow=c(2,2))
plot(aggregate(trans~direct, data=adata, FUN=mean), type="h", ylim=c(0,0.15),
xlab="Impressions on Day", main="Direct Mail")
plot(aggregate(trans~email, data=adata, FUN=mean), type="h", ylim=c(0,0.15),
xlab="Impressions on Day", main="Email")
plot(aggregate(trans~display, data=adata, FUN=mean), type="h", ylim=c(0,0.15),
xlab="Impressions on Day", main="Display")
plot(aggregate(trans~social, data=adata, FUN=mean), type="h", ylim=c(0,0.15),
xlab="Impressions on Day", main="Social Media")
```
## Method 4: Model-based Attribution Analysis
- <span style="color:dodgerblue">**Implementation:**</span>
```{r fit_in_model}
model.4<-fit.model(dt = adata, "trans", input.names=c("direct", "display","email", "social", "past.purchase"),
model.type="logistic", digits = 3)
datatable(model.4)
```
- <span style="color:dodgerblue">**Results & Interpretation: **</span>
From the summary, we could tell that **email** has the highest positive impact on the transaction - each email could increase 114% of probability for the transaction. The second highest channel is **direct mail** - each direct mail could increase 51% of probability for the transaction. Another notable fact is that the customers who have purchased before will significantly and positively impact the transactions - they are 160% more likely to purchase.
## Limitations and Uncertainties
- <span style="color:dodgerblue">**Simulated dataset**</span>: data is simulated but not real
- <span style="color:dodgerblue">**Other marketing channels**</span>: marketing channels in this case are limited
- <span style="color:dodgerblue">**Holdout test on Tuesday only**</span>: On 2017-01-03, 2017-01-17, 2017-01-24, 2017-01-31, 2017-02-07, 2017-02-14, and 2017-02-21, we conducted holdout test. But we cannot see if the result would be different on other days during a week.
- <span style="color:dodgerblue">**Small dataset**</span>: After combing the data, the informative dataset is not large enough to reach solid conclusions. Additionally, as the daily display impressions in the simulated dataset are pretty much the same, the data is not that informative from this perspective.