scrape with cloudscraper
requests.get now returns 403 Forbidden
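
The fix swaps requests for cloudscraper: a CloudScraper session solves Cloudflare's anti-bot challenge transparently while keeping the familiar requests API. A minimal sketch of the idea (the URL is illustrative; the 403 is what the commit message reports for plain requests):

import requests
from cloudscraper import CloudScraper

# Illustrative URL; per the commit message, plain requests.get is now
# answered with 403 Forbidden by Bandcamp's Cloudflare front.
url = "https://bandcamp.com/discover"
print(requests.get(url, timeout=3).status_code)        # 403
print(CloudScraper().get(url, timeout=3).status_code)  # 200, challenge solved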
hejops committed Aug 11, 2024
1 parent 0b4f983 commit cb1cf16
Showing 1 changed file with 79 additions and 37 deletions.
dita/scrape/bandcamp.py
@@ -1,17 +1,19 @@
 #!/usr/bin/env python3
-"""Module for scraping Bandcamp (since RSS feeds have been discontinued)
-"""
+"""Module for scraping Bandcamp (since RSS feeds have been discontinued)"""
 import json
+import math
 import os
 from datetime import datetime
 from time import sleep
 from typing import Iterator

 import requests
 import tqdm
 from bs4 import BeautifulSoup
+from cloudscraper import CloudScraper

 BC_SUBS_FILE = f"{os.path.expanduser('~')}/.config/newsboat/bandcamp"
+SCRAPER = CloudScraper()


 def get_album_age(album_url: str) -> int:
@@ -21,54 +23,64 @@ def get_album_age(album_url: str) -> int:
     Bandcamp date strings are always in the following format:
     'release[ds] July 29, 2022'
     """
+    sleep(1)
     try:
-        page = requests.get(album_url, timeout=3)
+        page = SCRAPER.get(album_url, timeout=3)
     except requests.exceptions.ReadTimeout:
         return -1
+    except requests.exceptions.ConnectionError:
+        print("?", album_url)
+        return -1
     soup = BeautifulSoup(page.content, "html.parser")
     album_credits = soup.find(
         "div",
         attrs={"class": "tralbum-credits"},
     )

     if not album_credits:
-        raise NotImplementedError
+        print("error", album_url)
+        return -1
+        # raise NotImplementedError

     release_date = [
         line
         for line in album_credits.text.split("\n")
         if line.strip().startswith("release")
     ][0]

-    # print(release_date)
-
-    return (
+    days = (
         datetime.now()
         - datetime.strptime(
             release_date.split(maxsplit=1)[1],
             "%B %d, %Y",
         )
     ).days

+    # print(album_url, release_date, days)
+    return days


 def get_label_albums(
     label_name: str,
-    max_days: int = 7,
-) -> Iterator[str]:
+    n: int = 7,
+    verbose: bool = False,
+) -> Iterator[str] | None:
     """Retrieve albums on the first page of a Bandcamp label's releases
-    published within the last <n> days.
+    published within the last `n` days.
     """
     label_url = f"https://{label_name}.bandcamp.com/music"

     try:
-        page = requests.get(label_url, timeout=3)
-    except requests.exceptions.ReadTimeout:
+        page = SCRAPER.get(label_url, timeout=3)
+    except (
+        requests.exceptions.ReadTimeout,
+        requests.exceptions.ConnectionError,
+    ):
         print("timeout:", label_url)
-        return []
+        return

     soup = BeautifulSoup(page.content, "html.parser")
-    for album in soup.find_all(
+    albums = soup.find_all(
         "li",
         attrs={
             # different layouts must be accounted for
@@ -79,64 +91,94 @@ def get_label_albums(
                 "music-grid-item square first-four featured",
             ],
         },
-    ):
+    )
+    # print(label_url, len(albums), "albums")
+    for i, album in enumerate(albums):
         if album.a["href"].startswith("https"):
             url = album.a["href"]  # external urls
         else:
             url = label_url.removesuffix("/music") + album.a["href"]

         if "/album/" not in url:
             continue
-        if not 0 < get_album_age(url) <= max_days:
+        if not url.startswith("http"):
+            print("?", url, label_url)
             continue
+        age = get_album_age(url)
+        if age < 0:
+            continue
+
+        if age > n:
+            # TODO: albums may not be displayed in chronological order!
+            # (sentientruin)
+            if i == 0:
+                continue
+            if age > 1000:
+                print("!" * (int(math.log2(age))), label_url)
+            break
+        if verbose:
+            print(url)
         yield url.partition("?")[0]


 def get_user_subscriptions(username: str) -> list[str]:
-    """Retrieve a list of Bandcamp labels followed by a user, with a single
-    POST request. Fairly quick.
+    """Retrieve a list of Bandcamp labels followed by `username`. Uses a single
+    `POST` request, which is required because only the first 45 labels are
+    returned in the HTML response. Fairly quick.
     https://michaelherger.github.io/Bandcamp-API
     https://bandcamp.com/developer/account
     """
     # Based on:
     # https://github.com/bembidiona/bandcamp-fan-feed/blob/master/bandcamp-fan-feed.py

+    url = f"https://bandcamp.com/{username}"
+    # print(url)
     soup = BeautifulSoup(
-        requests.get(f"https://bandcamp.com/{username}", timeout=3).content,
+        SCRAPER.get(url, timeout=3).content,
         "html.parser",
     )
+    assert "403 Forbidden" not in soup.text, soup
+    # TODO: figure out what causes this
+    assert (
+        len(soup.find_all("a", {"class": "fan-username"})) == 45
+    ), "cloudscraper failed to scrape correctly"

     # API calls require fan_id, which is not exposed by the API
     user_id = soup.find(type="button")["id"].split("_")[1]

-    with requests.Session() as sess:
-        sess.get(f"https://bandcamp.com/{username}/following/artists_and_labels")
-        # clicks the see more button
+    # # https://stackoverflow.com/a/64419449
+    # user_id = json.loads(soup.find(id="pagedata")["data-blob"])["fan_data"]["fan_id"]
+
+    # with requests.Session() as sess:
+    #     # click the see more button
+    #     sess.get(f"https://bandcamp.com/{username}/following/artists_and_labels")

     # curl 'https://bandcamp.com/api/fancollection/1/following_bands' --data-raw '{"fan_id":123,"older_than_token":"99999999999:0","count":99999999999}'

-    following = sess.post(
+    following = SCRAPER.post(
         "https://bandcamp.com/api/fancollection/1/following_bands",
         json={
             "fan_id": user_id,
+            # HACK: set older_than_token and count to absurdly large integers
+            # (we just need current epoch)
             "older_than_token": "9999999999:9999999999",
             "count": 9999,
         },
     )
-    followed = json.loads(following.text)
-
-    # yes, 'followeers' is not a typo...
-    return sorted(x["url_hints"]["subdomain"].strip() for x in followed["followeers"])
+    following = json.loads(following.text)["followeers"]
+
+    foo = sorted(x["url_hints"]["subdomain"].strip() for x in following)
+    # print(foo, len(foo))
+    return foo


-def get_albums_of_week(username: str) -> list[str]:
-    """Get list of URLs of Bandcamp releases published in the past week.
-
-    While we could return some kind of dict - to match columns of rss.py,
-    namely: ["title", "author", "feedurl", "url"] - rss needs to extract info
-    of url-only bc urls anyway, so... never mind."""
+def get_albums_of_week(username: str) -> set[str]:
+    """Get list of URLs of Bandcamp releases published in the past week"""
+    # 8 min / 286

     labels = get_user_subscriptions(username)

     albums = []
-    for label in tqdm.tqdm(labels):
-        albums += list(get_label_albums(label))
-    return albums
+    for label in labels:  # tqdm.tqdm(labels):
+        albums += list(get_label_albums(label, verbose=True))
+    return set(albums)
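
For reference, the subscription fetch now boils down to one POST against the undocumented fan-collection endpoint (the curl comment in the diff shows the same call). A standalone sketch; FAN_ID is a placeholder for the value the code scrapes out of the profile page, since the API itself does not expose it:

import json

from cloudscraper import CloudScraper

FAN_ID = 123456  # placeholder; the real value is scraped from https://bandcamp.com/<username>

resp = CloudScraper().post(
    "https://bandcamp.com/api/fancollection/1/following_bands",
    json={
        "fan_id": FAN_ID,
        # a token far in the future returns the whole list in one response
        "older_than_token": "9999999999:9999999999",
        "count": 9999,
    },
)
# 'followeers' (sic) is the actual key name returned by the endpoint
labels = sorted(
    band["url_hints"]["subdomain"]
    for band in json.loads(resp.text)["followeers"]
)
print(len(labels))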

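And a hypothetical end-to-end run ("somefan" is a placeholder username; per the "# 8 min / 286" note, a few hundred labels take several minutes, largely due to the sleep(1) before each album fetch):

from dita.scrape import bandcamp

# prints this week's album URLs, one per line
for url in sorted(bandcamp.get_albums_of_week("somefan")):
    print(url)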