getWeb.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin


def get_links(url):
    """Collect same-domain links found on the page at url."""
    domain = urlparse(url).netloc
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = set()
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and '#' not in href:
            # Resolve relative hrefs first, then compare domains;
            # a relative href has an empty netloc and would otherwise
            # never match.
            full_url = urljoin(url, href)
            if urlparse(full_url).netloc == domain:
                links.add(full_url)
    return links


def save_content(url):
    """Append the visible text of the page at url to <domain>.txt."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_text = soup.get_text()
    domain = urlparse(url).netloc
    with open(f'{domain}.txt', 'a', encoding='utf-8') as file:
        file.write(f"\n\n--- {url} ---\n\n")
        file.write(page_text)
    print(f"The contents of {url} were appended to {domain}.txt.")


crawled_urls = set()


def crawl(url):
    """Recursively visit url and every unvisited same-domain link."""
    if url not in crawled_urls:
        crawled_urls.add(url)
        save_content(url)
        for link in get_links(url):
            crawl(link)


def crawl_start(url):
    crawl("https://" + url)