-
Notifications
You must be signed in to change notification settings - Fork 2
/
scraper.js
224 lines (208 loc) · 8.08 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import * as cheerio from "cheerio";
import FetchHtml from './utilities/helpers.js';
import fs from 'fs';
// Handles all fetching and parsing
// Takes in text-html of the 'all programs' page of the year you want to scrape
// Returns a matrix of info to be used to hydrate models in database
// NOTE: private methods are weird in JS so they are not used, but the only real public method is getandorderinfomatrix()
class Program {
constructor(fetchedHtml) {
this.$ = cheerio.load(fetchedHtml);
this.allProgramsObject = this.$(".content").find("tbody").find('tr');
this.insideProgramsArray = [];
this.totalPrograms = this.allProgramsObject.length;
this.programInfo = [];
this.butteUrl = "https://programs.butte.edu";
}
GetProgramLinks = () => {
let programLinks = [];
let Index = 0;
while(Index < this.totalPrograms) {
let currentLink = this.butteUrl + this.$(this.allProgramsObject[Index]).find('a').attr('href');
programLinks.push(currentLink);
Index = Index + 1;
}
return programLinks;
}
GetProgramNames = () => {
let programNames = [];
let Index = 0;
while(Index < this.totalPrograms) {
let currentName = this.$(this.allProgramsObject[Index]).find('a').first().text().trim();
programNames.push(currentName);
Index = Index + 1;
}
return programNames;
}
GetProgramTypes = () => {
let programTypes = [];
let Index = 0;
while(Index < this.totalPrograms) {
let currentType = this.$(this.allProgramsObject[Index]).find("td").eq(0).text().trim();
programTypes.push(currentType);
Index = Index + 1;
}
return programTypes;
}
GetProgramDepts = () => {
let programDepts = [];
let Index = 0;
while(Index < this.totalPrograms) {
let currentDept = this.$(this.allProgramsObject[Index]).find("td").eq(1).text().trim();
programDepts.push(currentDept);
Index = Index + 1;
}
return programDepts;
}
GetProgramCodes = () => {
let programCodes = [];
let Index = 0;
while(Index < this.totalPrograms) {
let currentCode = this.$(this.allProgramsObject[Index]).find("td").last().text().trim();
programCodes.push(currentCode);
Index = Index + 1;
}
return programCodes;
}
// Start of methods for 2nd fetch, in each program
// Uses links from the all programs page
// loops through following each link and grabbing the html
// dumps that into an array in class variables
FetchEachProgram = async () => {
let allPrograms = [];
const totalLinks = this.programInfo[0].length;
// grabbing all html from each program
for (let i = 0; i < totalLinks; i++) {
console.log((i+1) + '/' + totalLinks);
const currentProgram = fs.readFileSync('./dev-programs/prog' + (i+1) + '.txt', 'utf8', function(err){
if (err) throw err;
})
allPrograms.push(currentProgram);
}
this.insideProgramsArray = allPrograms;
}
GetProgramsAbouts = () => {
let programsAbouts = [];
const totalPrograms = this.insideProgramsArray.length;
for (let i = 0; i < totalPrograms; i++) {
// Pull in next program from array and set up parser
this.$ = cheerio.load(this.insideProgramsArray[i]);
const programContent = this.$(".content");
// Get program about section, add place holder if their isnt one.
let aboutSection = programContent.find("#description:nth-child(1)").find('p').text();
if (aboutSection.length == 0) {
aboutSection = 'Needs about section';
};
// Push current about to function scope array
programsAbouts.push(aboutSection);
}
return programsAbouts;
}
GetProgramsChairs = () => {
let programsChairs = [];
const totalPrograms = this.insideProgramsArray.length;
for (let i = 0; i < totalPrograms; i++) {
// Pull in next program from array and set up parser
this.$ = cheerio.load(this.insideProgramsArray[i]);
const programContent = this.$(".content");
// Get chair of program
let chair = programContent.find(".bg-darkgray-1.p-15.border-radius-5.white.mb-30").find("p").first().text().trim();
chair = chair.split(",")[0];
// Push current chair to function scope array
programsChairs.push(chair);
}
return programsChairs;
}
GetProgramsSlos = () => {
let programsSlos = [];
const totalPrograms = this.insideProgramsArray.length;
for (let i = 0; i < totalPrograms; i++) {
// Pull in next program from array and set up parser
this.$ = cheerio.load(this.insideProgramsArray[i]);
const programContent = this.$(".content");
// This pushes an array of a programs slos into an array of all programs slos
let currentSloStore = []
let allProgramsScoped = this.insideProgramsArray;
programContent.find(".dots").children().each(function (j, elem) {
let $$ = cheerio.load(allProgramsScoped[i])
currentSloStore[j] = $$(elem).text().trim();
});
programsSlos.push(currentSloStore);
}
return programsSlos;
}
//TODO: refactor, maybe along with the other inprograms, a lot of repeated code
// BUG: not critical, some of the course descriptions are getting cut short, probably the and/or parsing
GetProgramsCourses = () => {
let programsCourses = [];
const totalPrograms = this.insideProgramsArray.length;
// going through each program
for (let i = 0; i < totalPrograms; i++) {
console.log('course' + (i+1) + '/' + totalPrograms)
this.$ = cheerio.load(this.insideProgramsArray[i]);
const programContent = this.$(".content");
let currentCoursesStore = [];
let allProgramsScoped = this.insideProgramsArray;
// once inside program
programContent.find(".classLinks").children().each(function (j, elem) {
let $$ = cheerio.load(allProgramsScoped[j])
let currentElem = $$(elem).text().trim() + ',';
if (currentElem.search('or ') !== -1) {
currentElem = currentElem.slice(3);
} else if (currentElem.search('and ') !== -1) {
currentElem = currentElem.slice(4);
}
currentCoursesStore.push(currentElem);
})
// format the data so each index is a full course description
let numberOfCourses = currentCoursesStore.length / 3;
let tempStore = []
let ti = 0;
for (let i = 0; i < numberOfCourses; i++) {
tempStore.push(currentCoursesStore.slice(ti, ti+3).join(' '))
ti = ti + 3;
}
currentCoursesStore = tempStore;
programsCourses.push(currentCoursesStore)
}
return programsCourses;
}
// populate infoMatrix class variable with data from all programs page
SetAllPrograms = () => {
let links = this.GetProgramLinks();
let names = this.GetProgramNames();
let types = this.GetProgramTypes();
let depts = this.GetProgramDepts();
let codes = this.GetProgramCodes();
this.programInfo.push(links);
this.programInfo.push(names);
this.programInfo.push(types);
this.programInfo.push(depts);
this.programInfo.push(codes);
}
// populate infoMatrix class variable with data from inside each program
SetAllInnerPrograms = () => {
let abouts = this.GetProgramsAbouts();
let chairs = this.GetProgramsChairs();
let slos = this.GetProgramsSlos();
let courses = this.GetProgramsCourses(); // Not using currently, commented out to save compute
this.programInfo.push(abouts);
this.programInfo.push(chairs);
this.programInfo.push(slos);
this.programInfo.push(courses);
}
// This is the only method that should be used from the class outside of the class
// something like encapsulation minus the part that enforces it... lol
Scrape = async () => {
// setting the infoMatrix with links/names/types/depts/codes.
this.SetAllPrograms();
// performing the second fetch with links from setAllPrograms().
await this.FetchEachProgram();
// setting the infoMatrix with abouts/chairs/slos.
this.SetAllInnerPrograms();
// returns the multi-d array
return this.programInfo;
}
}
// FetchHtml is re-exported here so it can be used to construct the class outside of this module
export { FetchHtml, Program };