### Workflow for extracting Facebook pages found with specific keywords, together with all of their content (posts, comments, replies) ###
### The code was developed as part of the Ancient Identities in Modern Britain (IARH) project.
### It extracts public Facebook pages found with specified keywords.
### Subsequently, it extracts all posts, comments and replies on these pages, automatically handling common errors encountered when using the Rfacebook library,
### thus allowing continuous data download without user intervention.
### Project: Ancient Identities in Modern Britain (IARH); ancientidentities.org
### Author: Marta Krzyzanska
### Some fragments of the code are based on Facebook-scraper.R, developed by Chiara Bonacchi as part of the IARH project.
##### PART 1: SET UP #####
# Set the working directory (replace "~path" with your own project path)
setwd("~path")
#Load all the required libraries:
library(Rfacebook)
library(devtools)
library(xlsx)
# Load or create the token to be used while extracting the material
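# If no saved token exists yet, it can be created once and saved first; a
# rough example ("APP_ID" and "APP_SECRET" are placeholders for your own
# Facebook app credentials, not values from this project):
# token <- fbOAuth(app_id = "APP_ID", app_secret = "APP_SECRET")
# save(token, file = "token.R")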
load("token.R")
# Load the required list of keywords (one keyword per row, no header)
keywords<-read.csv("keywords.csv",header=FALSE)
pageKeywords<-as.character(keywords[,1])
# Set up the error-handling function handleAPIErrors(), provided at the following link: https://github.com/IARHeritages/Facebook-codes/blob/master/Error%20handling%20with%20tryCatch
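# The function itself is not reproduced here; as a rough, hypothetical sketch
# of the idea only (the name, retry logic and error messages matched are
# assumptions, not the project's actual implementation), it wraps each
# Rfacebook call in tryCatch(), pausing and retrying on transient API errors
# and returning NULL for objects that cannot be retrieved:
handleAPIErrorsSketch <- function(id, token, type, n) {
  repeat {
    out <- tryCatch({
      if (type == "pages")         searchPages(id, token, n = n)
      else if (type == "posts")    getPage(id, token, n = n)
      else if (type == "comments") getPost(id, token, n = n, comments = TRUE)
      else if (type == "replies")  getCommentReplies(id, token, n = n)
    }, error = function(e) {
      # Pause and retry on rate-limit errors; return NULL to skip
      # unrecoverable objects (e.g. deleted or restricted content)
      if (grepl("limit", e$message, ignore.case = TRUE)) {
        Sys.sleep(300)
        "retry"
      } else {
        NULL
      }
    })
    if (!identical(out, "retry")) return(out)
  }
}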
##### PART 2: GET ALL THE PAGES #####
### Loop to get the pages
# Save all the pages in the list:
listPages <- list()
j=length(pageKeywords)+1
i=1
while (i < j){
print(i)
b <- handleAPIErrors(pageKeywords[i],token,"pages",100000000000)
b$pageKeywords <- pageKeywords[i]
listPages[[i]] <- b
i=i+1
}
#Save the pages in an r file:
save(listPages,file="listPages.R")
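# Optional sanity check on the saved result, e.g.:
# load("listPages.R")
# length(listPages)        # one element per keyword
# head(listPages[[1]]$id)  # page ids that Part 3 uses to download posts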
##### PART 3: GET ALL THE POSTS #####
# Gets all the posts on these pages.
# Once the posts on the pages found with a given keyword are downloaded, they are saved in an R file
# ("listPostsi.R", where i is the index of the keyword). The file contains a nested list, where
# listPosts[[k]] is a data frame of posts found on Facebook page k.
i=1
j=length(listPages)+1
while (i < j){
print(paste("Downloading posts for keyword",pageKeywords[i],spe=""))
listPosts<- c()
page <- listPages[[i]]
k=1
l=length(page$id)+1
while (k<l){
print (k)
id<-page$id[k]
b <- handleAPIErrors(id,token,"posts",100000000000)
listPosts[[k]]<-b
k=k+1
}
save(listPosts,file=paste("listPosts",i,".R",sep=""))
i=i+1
}
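# Optional check of one of the saved files, e.g. for the first keyword:
# load("listPosts1.R")
# str(listPosts[[1]])      # data frame of posts from the first page
# head(listPosts[[1]]$id)  # post ids that Part 4 uses to download comments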
##### PART 4: GET ALL THE COMMENTS ON THE PAGES #####
# Gets all the comments on the posts downloaded in Part 3.
# Once the comments on the pages found with a given keyword are downloaded, they are saved in an R file
# ("commentsi.R", where i is the index of the keyword). The file contains a nested list, where
# comments[[k]][[m]]$comments is a data frame of comments found under post m on page k.
i=1
j=length(listPages)+1
while (i<j){
print (paste("Downloading comments for keyword",pageKeywords[i],spe=""))
load(paste("listPosts",i,".R",sep=""))
comments <- c()
k=1
l=length(listPosts)+1
while(k<l){
print(k)
com2 <- list()
m=1
n=length(listPosts[[k]]$id)+1
while(m<n){
id<-listPosts[[k]]$id[m]
b<-handleAPIErrors(id,token,"comments",10000000000)
com2[[m]]<-b
m=m+1
}
comments[[k]]<-com2
k=k+1
}
save(comments,file=paste("comments",i,".R",sep=""))
i=i+1
}
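# Optional check of the saved comments, e.g. for the first keyword:
# load("comments1.R")
# head(comments[[1]][[1]]$comments$message)  # comments under post 1 on page 1
# head(comments[[1]][[1]]$comments$id)       # comment ids that Part 5 uses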
##### PART 5: GET ALL THE REPLIES ON THE PAGES #####
# Gets all the replies to the comments downloaded in Part 4.
# Once the replies on the pages found with a given keyword are downloaded, they are saved in an R file
# ("repliesi.R", where i is the index of the keyword). The file contains a nested list, where
# replies[[k]][[m]][[o]]$replies is a data frame of replies found under comment o, written under post m on page k.
i=1
j=length(listPages)+1
while (i<j){
print(paste("Downloading replies for keyword",pageKeywords[i],spe=""))
load(paste("comments",i,".R",sep=""))
replies <- c()
k=1
l=length(comments)+1
while(k<l){
rep2 <- list()
m=1
n=length(comments[[k]])+1
while(m<n){
rep3 <- list()
o=1
p=length(comments[[k]][[m]]$comments$id)+1
while(o<p){
print (o)
id<-comments[[k]][[m]]$comments$id[o]
b<-handleAPIErrors(id,token,"replies",10000000000)
rep3[[o]]<-b
o=o+1
}
rep2[[m]]<-rep3
m=m+1
}
replies[[k]]<-rep2
k=k+1
}
save(replies,file=paste("replies",i,".R",sep=""))
i=i+1
}
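# Optional check of the saved replies, e.g. for the first keyword:
# load("replies1.R")
# head(replies[[1]][[1]][[1]]$replies$message)  # replies under comment 1, post 1, page 1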