-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
97 lines (92 loc) · 3.21 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Main entry point of the scraper; this is the script to run.
The main function is executed automatically when the script is called directly.
The other functions are used to enter the scraped data into the SQLite database file.
"""
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
from scraper_utils import (
getBuiltWith,
getImages,
getLikes,
getSubtitle,
getTitle,
getContent,
getUpdates,
getParticipants,
getUserData,
)
from database_utils import (
makeDatabase,
initializeDatabase,
insertFollowing,
insertImage,
insertParticipant,
insertParticipation,
insertProject,
insertUpdate,
insertProjectUses,
insertTechnology,
)
import sys
# Root URL of the hackathon's Devpost site.
BASEURL = "https://wirvsvirushackathon.devpost.com"
# Prefix of the paginated submissions listing; main() appends the page number.
# NOTE(review): the path begins with a double slash ("//submissions"). It is
# kept exactly as written -- confirm whether that is intentional.
SUBMISSIONS_URL = f"{BASEURL}//submissions?page="
def _fetch_soup(url):
    """Download *url* and return its HTML parsed with ``html.parser``.

    The HTTP response is closed as soon as its body has been read, so open
    connections do not accumulate while many pages are crawled.
    """
    with urlopen(url) as response:
        return BeautifulSoup(response.read(), "html.parser")


def _store_submission(cursor, sub_url):
    """Scrape the project page at *sub_url* and insert its data via *cursor*.

    Inserts the project itself, then its participants, updates, images and
    technologies, in that order. Nothing is committed here; the caller
    decides when to commit. Any exception raised by the download, the
    scraper helpers or the insert helpers propagates to the caller.
    """
    sub_obj = _fetch_soup(sub_url)

    title = getTitle(sub_obj)
    title_text = title.get_text().strip()
    project_id = insertProject(
        cursor,
        title_text,
        getSubtitle(sub_obj, title).get_text().strip(),
        getContent(sub_obj),
        sub_url,
        getLikes(sub_obj),
    )
    print(f"Collecting project {project_id}:{title_text}")

    for participant in getParticipants(sub_obj):
        insertParticipant(cursor, *getUserData(participant))
        insertParticipation(cursor, project_id, participant)

    for update in getUpdates(sub_obj):
        # The first element of an update is passed to getUserData, i.e. it
        # identifies the update's author, who is stored as a participant too.
        insertParticipant(cursor, *getUserData(update[0]))
        insertUpdate(cursor, project_id, *update)

    for image in getImages(sub_obj):
        insertImage(cursor, project_id, image)

    for tech in getBuiltWith(sub_obj):
        insertTechnology(cursor, tech)
        insertProjectUses(cursor, project_id, tech)


def main():
    """Crawl every submissions page and store each project in the database.

    The database path is taken from the first command-line argument. Listing
    pages are requested one after another, starting at page 1, until a page
    contains no submission links. Each submission is scraped and committed
    individually; a submission that raises is reported on stdout and skipped
    so that one broken page does not abort the whole crawl.

    Raises:
        ValueError: if no database path was given on the command line.
    """
    if len(sys.argv) <= 1:
        raise ValueError(
            "provide at least a path where the database should be saved to"
        )
    conn, cursor = makeDatabase(sys.argv[1])
    initializeDatabase(conn, cursor)

    page_count = 1
    while True:
        listing = _fetch_soup(SUBMISSIONS_URL + str(page_count))
        submissions = listing.find_all(
            "a", {"class": "block-wrapper-link fade link-to-software"}
        )
        if not submissions:
            # An empty listing page marks the end of the pagination.
            break

        for submission in submissions:
            try:
                _store_submission(cursor, submission.attrs["href"])
                conn.commit()
            except Exception as e:
                # Deliberate best-effort boundary: report and move on.
                # NOTE(review): rows inserted before the failure are not
                # rolled back here -- presumably they are persisted by the
                # next successful commit; confirm whether that is wanted.
                print("Submission threw an error. Skipping.")
                print(e)
        page_count += 1
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()