-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy patharchive_example.py
52 lines (44 loc) · 1.45 KB
/
archive_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from bs4 import BeautifulSoup
import requests
# News site URL info
# Site root — prefixed onto the relative hrefs scraped from the page.
BASE_URL = "https://psmag.com"
# Listing page of all stories by this author; starting point for the scrape.
AUTHOR_URL = BASE_URL + "/author/nick-hagar-1"
def collect_urls(author_page):
    """Scrape an author page for article links.

    Args:
        author_page (str): URL for the author page.

    Returns:
        list: Absolute article URLs found on the page.

    Raises:
        requests.HTTPError: If the author page request returns an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # Timeout so a stalled server can't hang the script forever.
    r = requests.get(author_page, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    # Name the parser explicitly; omitting it triggers a bs4 warning and
    # makes the chosen parser depend on what happens to be installed.
    soup = BeautifulSoup(r.text, "html.parser")
    link_elements = soup.find_all("phoenix-super-link")
    # hrefs on this site are root-relative, so prefix the site root.
    return [BASE_URL + el["href"] for el in link_elements]
def send_to_wayback(links):
    """Archive stories via the Wayback Machine.

    Prints the archive URL for each successfully saved link and an error
    line for any that fail; one bad URL does not abort the rest.

    Args:
        links (list): List of URLs to archive.
    """
    for url in links:
        save_url = f"https://web.archive.org/save/{url}"
        try:
            # The save endpoint can be very slow; cap the wait rather
            # than hanging indefinitely.
            response = requests.get(save_url, timeout=120)
        except requests.RequestException:
            # A real connection error previously crashed the whole batch;
            # report it and move on to the next link instead.
            print(f"❌ Connection error for {url}")
            continue
        if response.status_code == 200:
            # We're just printing the archive URLs, but we could save them too!
            print(f"✔ {response.url}")
        else:
            print(f"❌ Connection error for {url}")
def main():
    """Collect the author's story links and archive the ten most recent."""
    print("💻 Starting link collection")
    # Pull every story URL from the author's listing page.
    story_links = collect_urls(AUTHOR_URL)
    print(f"✅ {len(story_links)} stories collected")
    print("📂 Archiving most recent 10")
    print("🔗 Archive links:")
    # Hand only the newest ten links to the Wayback Machine.
    send_to_wayback(story_links[:10])
    print("🎉 Success!")
# Run the scrape-and-archive pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()