crawler.py
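"""Crawler for FindAMasters MSc listings.

Collects course links from 400 results pages (15 courses per page) into
links.txt, then downloads the HTML of every course page into per-page
folders (page_1/ ... page_400/).
"""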
import requests
from bs4 import BeautifulSoup
import time
import os

def url_i(i):
    # Build the URL of the i-th results page
    url = "https://www.findamasters.com/masters-degrees/msc-degrees/?PG={}".format(i)
    return url
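# Quick check of the URL builder (illustrative):
#   url_i(3) -> "https://www.findamasters.com/masters-degrees/msc-degrees/?PG=3"
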
def get_MasterURLS():
    for i in range(1, 401):
        links_list = set()
        url = url_i(i)
        response = requests.get(url)
        if response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all the links in the HTML
            links = soup.find_all('a', href=True)
            # Keep only course links: the last path segment starts with "?i"
            # and carries no "#" fragment
            for link in links:
                href = link['href']
                splitted = href.split("/")
                if splitted[-1][0:2] == "?i" and len(splitted[-1].split("#")) != 2:
                    links_list.add(href)
            # Append the links only when the full set of 15 was found
            if len(links_list) == 15:
                with open("links.txt", 'a') as f:
                    for href in links_list:
                        f.write(href + '\n')
        # Sleep so we do not saturate the server
        time.sleep(1)

def main():
    get_MasterURLS()
    j = 0
    with open('links.txt', 'r') as file:
        # Read all the collected links (15 per results page is assumed)
        file_content = file.readlines()
    for page in range(1, 401):
        folder_name = "page_{}".format(page)
        # Create a directory for the page if it does not exist
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
        for web in range(j, 15 + j):
            # Save the HTML content of every master page
            website = "https://www.findamasters.com" + file_content[web].replace("\n", "")
            response = requests.get(website)
            # Sleep so we do not saturate the server
            time.sleep(2)
            # 'w' (not 'a') so a re-run overwrites instead of appending duplicates
            with open(folder_name + "/" + "page{}_master{}.html".format(page, web), 'w', encoding='utf-8') as f:
                f.write(response.text)
        j = page * 15
        print("Page " + str(page) + " has been completed.")

if __name__ == "__main__":
    main()
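
# To run this crawler (requests and beautifulsoup4 must be installed):
#   pip install requests beautifulsoup4
#   python crawler.py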