-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathItemSetFrequencyAnalysis.R
121 lines (87 loc) · 4.06 KB
/
ItemSetFrequencyAnalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#Predicting items with highest sales
#Algorithm: Apriori
#Packages Required:
# 1. gdata - Data manipulation helpers (note: read.csv() used below comes from base R's utils package)
# 2. arules - For manipulating and analyzing transaction data and patterns
# - Also provides interfaces to C implementations of the association mining algorithms Apriori and Eclat
# 3. e1071 - Provides svm() for support vector machine modelling.
#About Dataset:
# Total Training Sets = 983
# Total Products = 98
#Load the libraries
library("arules")
library("e1071")
library("gdata")
# Load the dataset.
# "data.csv" is resolved against the current working directory. Columns 2 and
# 5 are selected by position; the names used below imply they are
# Customer.Name and Product.Name.
dataset <- read.csv(file = "data.csv", header = TRUE, sep = ",")

# Keep one row per distinct (customer, product) pair so that repeat purchases
# of the same product by the same customer count once.
purchases <- unique(dataset[c(2, 5)])

# One "basket" per customer: the set of products that customer ever bought.
aggSubSet <- split(purchases$Product.Name, purchases$Customer.Name)
trns <- as(aggSubSet, "transactions")

# Mine association rules with Apriori: support >= 5%, confidence >= 60%,
# and at least two items per rule (so no rules with an empty left-hand side).
Rules <- apriori(
  trns,
  parameter = list(supp = 0.05, conf = 0.6, target = "rules", minlen = 2)
)

# Display at most the first 100 rules. Indexing with a fixed 1:100 fails with
# a subscript error whenever fewer than 100 rules were mined.
topRules <- Rules[seq_len(min(100L, length(Rules)))]
inspect(topRules)

# inspect() is a display function; coerce explicitly to get a data frame that
# can be written out.
ItemSet <- as(topRules, "data.frame")
#write.csv(ItemSet, file= "ItenSet.csv")
# Item frequency distribution of the products sold: absolute counts for the
# 30 most frequently purchased items.
itemFrequencyPlot(
  trns,
  topN = 30,
  type = "absolute",
  xlab = "Most Frequent Item (MFI)",
  ylab = "Frequency",
  main = "Top 30 Frequent Items"
)

# Compare item frequencies: the transactions containing a high-frequency item
# (frankfurter) versus the dataset as a whole.
has_frankfurter <- trns %in% "frankfurter"
trns.high <- trns[has_frankfurter]

# Disabled alternative: relative frequencies of the first 20 items, with the
# population averages drawn as a line.
#itemFrequencyPlot(trns.high[, 1:20],type="relative",population = trns[, 1:20])

# Lift ratio (frequency in trns.high / frequency in the population) for items
# with a support of 20% in the population, drawn as horizontal bars.
itemFrequencyPlot(
  trns.high,
  population = trns,
  support = 0.2,
  lift = TRUE,
  horiz = TRUE
)

# Frequency of every item across all transactions, highest first.
item_freq <- itemFrequency(trns)
barplot(sort(item_freq, decreasing = TRUE))
########################################################################################################
########################################################################################################
################ Analyzing purchase of Most frequent Item using SVM ################################
#######################Item Analyzed is Frankfurter ####################################################
# Build the SVM input for the most frequent item (frankfurter).
# Columns 3, 5, 6 and 7 are taken by position; the names used below imply they
# include Order.Date, Product.Name, Unit.Price and Quantity.
svmSubset <- dataset[c(3, 5, 6, 7)]

# %in% never yields NA, so rows with a missing product name are dropped rather
# than turned into all-NA rows (which is what `==` subsetting would produce).
svmSubset <- svmSubset[svmSubset$Product.Name %in% "frankfurter", ]
if (nrow(svmSubset) == 0L) {
  stop("No 'frankfurter' rows found in the dataset", call. = FALSE)
}

# Product.Name is constant after the filter; keep the remaining three columns.
svmSubset <- svmSubset[c(1, 3, 4)]
svmSubset$Total.Cost <- svmSubset$Unit.Price * svmSubset$Quantity
if (anyNA(svmSubset$Total.Cost)) {
  stop("Total.Cost contains missing values; cannot split on the median",
       call. = FALSE)
}

# Parse the order date once (month/day/year) and derive year and month.
# Column 1 is overwritten with the month number and renamed accordingly.
orderDate <- as.Date(svmSubset$Order.Date, format = "%m/%d/%Y")
svmSubset$Order.Year <- as.numeric(format(orderDate, "%Y"))
svmSubset$Order.Date <- as.numeric(format(orderDate, "%m"))
names(svmSubset)[1] <- "Order.Month"

# Label each order "High" when its total cost is strictly above the median,
# otherwise "Low". Vectorised: the median is computed once, not per row.
med <- median(svmSubset$Total.Cost)
svmSubset$Order.Val <- ifelse(svmSubset$Total.Cost > med, "High", "Low")

# Modelling columns by position: 6 = Order.Val (response), 1 = Order.Month,
# 3 = third original column (presumably Quantity -- confirm against data.csv).
svmSubset2 <- svmSubset[c(6, 1, 3)]

# Persist the prepared data, then read it back. write.csv() also emits the row
# names as a first column, which is why column 1 is dropped after reading.
write.csv(svmSubset2, file = "newdata.csv")
# stringsAsFactors = TRUE makes Order.Val a factor on every R version; since
# R 4.0 the default is FALSE, and svm() only performs classification when the
# response is a factor.
cleanData <- read.csv(
  file = "newdata.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
cleanData <- cleanData[c(2, 3, 4)]

# Fit an SVM classifier for Order.Val on all prepared rows (default kernel).
model <- svm(Order.Val ~ ., data = cleanData)
# Alternative kernels / hyper-parameters that were tried:
#model<-svm(Order.Val ~ .,data=cleanData,kernel="polynomial",degree=3,coef0=0.030,cost=1.3,tolerance=0.008)
#model<-svm(Order.Val ~ .,data=cleanData,kernel="radial",gamma=0.01,cost=10)
#plot(model,cleanData)
# Hold-out evaluation: 25% of the rows form the test set, the remaining 75%
# the training set. The split is random; call set.seed() beforehand if a
# reproducible split is needed.

# Row positions of cleanData (seq_len is safe when there are zero rows).
index <- seq_len(nrow(cleanData))
# Sample 25% of the positions for testing.
testindex <- sample(index, trunc(length(index) * 25 / 100))

# Segregate test and training rows. setdiff() is used instead of negative
# indexing because x[-integer(0), ] selects nothing when the test sample is
# empty, which would leave the training set empty too.
testset <- cleanData[testindex, ]
trainingset <- cleanData[setdiff(index, testindex), ]

# Fit the model on the training rows only, so that the test rows are genuinely
# unseen; evaluating a model fitted on all of cleanData would overstate
# accuracy.
model <- svm(Order.Val ~ ., data = trainingset)

# Plot the model on the training set.
plot(model, trainingset)

# Predict the held-out rows (column 1 is the response, so it is dropped).
prediction <- predict(model, testset[, -1])

# Confusion matrix. The true labels are aligned to the prediction levels so
# the table is always square, even if a class is absent from the test sample.
tab <- table(
  pred = prediction,
  true = factor(testset[, 1], levels = levels(prediction))
)

# Accuracy in percent: correctly classified (diagonal) over all test rows.
accuracy <- sum(diag(tab)) / sum(tab) * 100
accuracy