-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAssignment 5 Clinical Trial.R
105 lines (72 loc) · 2.55 KB
/
Assignment 5 Clinical Trial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Assignment 5: Clinical Trial
# Load the trial records; text columns stay character, not factor.
clinical_trial <- read.csv(
  file = "clinical_trial.csv",
  stringsAsFactors = FALSE
)
# Quick look at the structure of what was read.
str(clinical_trial)
# Number of characters in the longest abstract.
max(nchar(clinical_trial$abstract))
# Row index of the title with the fewest characters.
which.min(nchar(clinical_trial$title))
# Install new packages
#install.packages("tm")
library(tm)
#install.packages("SnowballC")
library(SnowballC)
# TITLE
# Turn the title column into a bag-of-words data frame (one row per record,
# one column per retained stemmed term).
# Fix: the data frame read above is `clinical_trial`; `trials` is never
# defined in this script.
corpusTitle <- Corpus(VectorSource(clinical_trial$title))
# Wrap tolower in content_transformer() so the result remains a valid tm
# corpus; this replaces the old tolower + PlainTextDocument workaround.
corpusTitle <- tm_map(corpusTitle, content_transformer(tolower))
corpusTitle <- tm_map(corpusTitle, removePunctuation)
corpusTitle <- tm_map(corpusTitle, removeWords, stopwords("english"))
corpusTitle <- tm_map(corpusTitle, stemDocument)
dtmTitle <- DocumentTermMatrix(corpusTitle)
# Drop terms whose sparsity exceeds the 0.95 threshold.
dtmTitle <- removeSparseTerms(dtmTitle, 0.95)
dtmTitle <- as.data.frame(as.matrix(dtmTitle))
# ABSTRACT
# Turn the abstract column into a bag-of-words data frame, using the same
# preprocessing steps as for the titles.
# Fix: read from `clinical_trial` (`trials` is never defined), and feed each
# step the abstract corpus. Previously every step took corpusTitle as input,
# so the abstract text never reached dtmAbstract.
corpusAbstract <- Corpus(VectorSource(clinical_trial$abstract))
# content_transformer() keeps the corpus structure intact after tolower.
corpusAbstract <- tm_map(corpusAbstract, content_transformer(tolower))
corpusAbstract <- tm_map(corpusAbstract, removePunctuation)
corpusAbstract <- tm_map(corpusAbstract, removeWords, stopwords("english"))
corpusAbstract <- tm_map(corpusAbstract, stemDocument)
dtmAbstract <- DocumentTermMatrix(corpusAbstract)
# Drop terms whose sparsity exceeds the 0.95 threshold.
dtmAbstract <- removeSparseTerms(dtmAbstract, 0.95)
dtmAbstract <- as.data.frame(as.matrix(dtmAbstract))
# Prefix the columns so a term present in both titles and abstracts yields
# two distinct column names once the frames are bound together.
colnames(dtmTitle) <- paste0("T", colnames(dtmTitle))
colnames(dtmAbstract) <- paste0("A", colnames(dtmAbstract))
dtm <- cbind(dtmTitle, dtmAbstract)
# Attach the outcome variable from the original data.
dtm$trial <- clinical_trial$trial
# Split the data
# 70/30 train/test partition on the outcome; the seed fixes the partition.
library(caTools)
set.seed(144)
split <- sample.split(dtm$trial, SplitRatio = 0.7)
# `split` is a logical vector: TRUE rows go to training, the rest to test.
trainSparse <- subset(dtm, split)
testSparse <- subset(dtm, !split)
# Outcome counts in the training set.
table(trainSparse$trial)
# Build a CART model
library(rpart)
library(rpart.plot)
trialCART <- rpart(trial ~ ., data = trainSparse, method = "class")
prp(trialCART)
# Training-set predictions: take the second column of the class-probability
# matrix (presumably P(trial == 1), assuming trial is coded 0/1 -- confirm).
predTrain <- predict(trialCART)[, 2]
summary(predTrain)
table(trainSparse$trial, predTrain >= 0.5)
# Evaluate the performance of the model on the test set
predictCART <- predict(trialCART, newdata = testSparse)[, 2]
table(testSparse$trial, predictCART >= 0.5)
# Compute accuracy from the predictions rather than hard-coding counts
# (assumes trial is coded 0/1 -- confirm).
mean((predictCART >= 0.5) == (testSparse$trial == 1))
# Baseline accuracy: share of the most frequent outcome in the test set.
# Fix: the outcome column is `trial`; `Negative` does not exist in this data.
baselineCounts <- table(testSparse$trial)
baselineCounts
max(baselineCounts) / sum(baselineCounts)
# Random forest model
# NOTE: the object names tweetRF / tweetlm are kept unchanged so any code
# that refers to them keeps working.
library(randomForest)
set.seed(123)
# Fix: the outcome column is `trial`, not `Negative`. randomForest only does
# classification when the response is a factor, so convert it on a copy and
# leave trainSparse itself untouched for the models that follow.
trainRF <- trainSparse
trainRF$trial <- as.factor(trainRF$trial)
tweetRF <- randomForest(trial ~ ., data = trainRF)
# Make predictions:
predictRF <- predict(tweetRF, newdata = testSparse)
table(testSparse$trial, predictRF)
# Accuracy, computed from the predictions rather than hard-coded counts:
mean(as.character(predictRF) == as.character(testSparse$trial))
# Logistic regression on the same predictors. No seed is needed here because
# fitting a glm involves no random sampling.
tweetlm <- glm(trial ~ ., data = trainSparse, family = "binomial")
predictlm <- predict(tweetlm, newdata = testSparse, type = "response")
table(testSparse$trial, predictlm >= 0.5)
# Accuracy at the 0.5 threshold (assumes trial is coded 0/1 -- confirm).
mean((predictlm >= 0.5) == (testSparse$trial == 1))