Skip to content

Commit

Permalink
Merge pull request #1 from NHagar/account-recs-
Browse files Browse the repository at this point in the history
User data collection and refactored newsletter logic
  • Loading branch information
NHagar authored Apr 3, 2024
2 parents b2e00f7 + d24ea96 commit f31a77f
Show file tree
Hide file tree
Showing 9 changed files with 532 additions and 25 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.conda/
__pycache__/
dist/
.env
.env
.vscode/
197 changes: 197 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

34 changes: 14 additions & 20 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,23 +1,17 @@
[build-system]
requires = ["hatchling",
"requests"]
build-backend = "hatchling.build"

[project]
[tool.poetry]
name = "substack-api"
version = "0.0.2"
authors = [
{ name="Nick Hagar", email="[email protected]" },
]
description = "The unofficial Substack API wrapper for Python."
version = "0.1.0"
description = "unofficial python wrapper for collecting substack data"
authors = ["NHagar <[email protected]>"]
license = "MIT"
readme = "README.md"
license = { file="LICENSE" }
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]

[project.urls]
"Homepage" = "https://github.com/NHagar/substack_api"
[tool.poetry.dependencies]
python = "^3.8"
requests = "^2.31.0"
beautifulsoup4 = "^4.12.3"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
File renamed without changes.
47 changes: 43 additions & 4 deletions src/substack_api/substack_api.py → substack_api/newsletter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,22 @@
from time import sleep
from typing import Dict, List, Tuple, Union

from bs4 import BeautifulSoup
import requests


# Request headers passed as `headers=` to the `requests.get` calls in this module.
# The User-Agent is a desktop Chrome string; presumably this keeps Substack from
# treating the client differently than a browser — TODO confirm.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}


def list_all_categories() -> List[Tuple[str, int]]:
    """
    Get name / id representations of all newsletter categories

    Returns
    -------
    List of ``(name, id)`` tuples, one per category object in the JSON
    response of the categories endpoint
    """
    endpoint_cat = "https://substack.com/api/v1/categories"
    r = requests.get(endpoint_cat, headers=HEADERS)
    # Decode the response body once and build the pairs in a single pass.
    categories = [(i["name"], i["id"]) for i in r.json()]
    return categories


Expand Down Expand Up @@ -50,7 +53,12 @@ def category_name_to_id(name: str) -> int:
raise ValueError(f"{name} is not in Substack's list of categories")


def get_newsletters_in_category(category_id: int, subdomains_only: bool = False, start_page: int = None, end_page: int = None) -> List:
def get_newsletters_in_category(
category_id: int,
subdomains_only: bool = False,
start_page: int = None,
end_page: int = None,
) -> List:
"""
Collects newsletter objects listed under specified category
Expand Down Expand Up @@ -84,7 +92,12 @@ def get_newsletters_in_category(category_id: int, subdomains_only: bool = False,
return all_pubs


def get_newsletter_post_metadata(newsletter_subdomain: str, slugs_only: bool = False, start_offset: int = None, end_offset: int = None) -> List:
def get_newsletter_post_metadata(
newsletter_subdomain: str,
slugs_only: bool = False,
start_offset: int = None,
end_offset: int = None,
) -> List:
"""
Get available post metadata for newsletter
Expand All @@ -104,6 +117,9 @@ def get_newsletter_post_metadata(newsletter_subdomain: str, slugs_only: bool = F
full_url = f"https://{newsletter_subdomain}.substack.com/api/v1/archive?sort=new&search=&offset={offset_start}&limit=10"
posts = requests.get(full_url, headers=HEADERS).json()

if len(posts) == 0:
break

last_id = posts[-1]["id"]
if last_id == last_id_ref:
break
Expand All @@ -121,7 +137,9 @@ def get_newsletter_post_metadata(newsletter_subdomain: str, slugs_only: bool = F
return all_posts


def get_post_contents(newsletter_subdomain: str, slug: str, html_only: bool = False) -> Union[Dict, str]:
def get_post_contents(
newsletter_subdomain: str, slug: str, html_only: bool = False
) -> Union[Dict, str]:
"""
Gets individual post metadata and contents
Expand All @@ -137,3 +155,24 @@ def get_post_contents(newsletter_subdomain: str, slug: str, html_only: bool = Fa
return post_info["body_html"]
else:
return post_info


def get_newsletter_recommendations(newsletter_subdomain: str) -> List[Dict[str, str]]:
    """
    Gets recommended newsletters for a given newsletter

    Parameters
    ----------
    newsletter_subdomain : Substack subdomain of newsletter (can be retrieved from `get_newsletters_in_category`)

    Returns
    -------
    List of ``{"title": ..., "url": ...}`` dicts scraped from the newsletter's
    ``/recommendations`` page. The URL has its query string removed. Entries
    whose card has no link (no ``<a>`` tag, or an ``<a>`` without ``href``)
    are skipped instead of raising.
    """
    endpoint = f"https://{newsletter_subdomain}.substack.com/recommendations"
    r = requests.get(endpoint, headers=HEADERS)
    soup = BeautifulSoup(r.text, "html.parser")

    # Cards and titles are collected separately and paired by position.
    div_elements = soup.find_all("div", class_="publication-content")
    title_elements = soup.find_all("div", {"class": "publication-title"})

    results = []
    for title_element, div in zip(title_elements, div_elements):
        anchor = div.find("a")
        # `find` returns None when the card has no anchor; `.get` returns None
        # when the anchor has no href. Either way there is no URL to report.
        href = anchor.get("href") if anchor is not None else None
        if not href:
            continue
        # Keep only the part of the link before any query parameters.
        results.append({"title": title_element.text, "url": href.split("?")[0]})

    return results
77 changes: 77 additions & 0 deletions substack_api/user.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from typing import Dict, List

import requests

# Request headers passed as `headers=` to every `requests.get` call in this module.
# The User-Agent is a desktop Chrome string; presumably this keeps Substack from
# treating the client differently than a browser — TODO confirm.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}


def get_user_id(username: str) -> int:
    """
    Look up the ID of a Substack user from their public profile.

    Parameters
    ----------
    username : str
        The username of the Substack user.

    Returns
    -------
    int
        The value of the ``id`` field in the public profile response.
    """
    profile_url = f"https://substack.com/api/v1/user/{username}/public_profile"
    profile = requests.get(profile_url, headers=HEADERS).json()
    return profile["id"]


def get_user_reads(username: str) -> List[Dict[str, str]]:
    """
    Collect the newsletters listed under a user's public profile subscriptions.

    Parameters
    ----------
    username : str
        The username of the Substack user.

    Returns
    -------
    List[Dict[str, str]]
        One dict per entry in the profile's ``subscriptions`` list, with the
        keys ``publication_id``, ``publication_name`` and
        ``subscription_status``.
    """
    response = requests.get(
        f"https://substack.com/api/v1/user/{username}/public_profile",
        headers=HEADERS,
    )
    profile = response.json()

    reads = []
    for subscription in profile["subscriptions"]:
        entry = {
            "publication_id": subscription["publication"]["id"],
            "publication_name": subscription["publication"]["name"],
            "subscription_status": subscription["membership_state"],
        }
        reads.append(entry)
    return reads


def get_user_likes(user_id: int):
    """
    Fetch the "like" entries from a user's profile feed.

    Parameters
    ----------
    user_id : int
        The user ID of the Substack user.

    Returns
    -------
    The value stored under ``items`` in the feed response.
    """
    feed_url = f"https://substack.com/api/v1/reader/feed/profile/{user_id}?types%5B%5D=like"
    feed = requests.get(feed_url, headers=HEADERS).json()
    return feed["items"]


def get_user_notes(user_id: int):
    """
    Fetch the notes and comments in a user's profile feed.

    Parameters
    ----------
    user_id : int
        The user ID of the Substack user.

    Returns
    -------
    The value stored under ``items`` in the feed response.
    """
    feed_url = f"https://substack.com/api/v1/reader/feed/profile/{user_id}"
    feed = requests.get(feed_url, headers=HEADERS).json()
    return feed["items"]
Empty file added tests/__init__.py
Empty file.
147 changes: 147 additions & 0 deletions tests/test_newsletter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import unittest
from unittest.mock import patch, Mock, MagicMock
from bs4 import BeautifulSoup
from substack_api.newsletter import (
get_newsletter_post_metadata,
get_newsletter_recommendations,
get_post_contents,
HEADERS,
)


class TestGetNewsletterPostMetadata(unittest.TestCase):
    """Exercises `get_newsletter_post_metadata` with `requests.get` mocked out."""

    @patch("requests.get")
    def test_get_newsletter_post_metadata_slugs_only(self, mock_get):
        """`slugs_only=True` returns just the slug strings."""
        response = Mock(ok=True)
        response.json.return_value = [
            {"id": 1, "slug": "post-1"},
            {"id": 2, "slug": "post-2"},
        ]
        mock_get.return_value = response

        slugs = get_newsletter_post_metadata("test_subdomain", slugs_only=True)

        self.assertEqual(slugs, ["post-1", "post-2"])

    @patch("requests.get")
    def test_get_newsletter_post_metadata_all_metadata(self, mock_get):
        """`slugs_only=False` returns the full post dicts."""
        response = Mock(ok=True)
        response.json.return_value = [
            {"id": 1, "slug": "post-1", "title": "Post 1"},
            {"id": 2, "slug": "post-2", "title": "Post 2"},
        ]
        mock_get.return_value = response

        posts = get_newsletter_post_metadata("test_subdomain", slugs_only=False)

        expected = [
            {"id": 1, "slug": "post-1", "title": "Post 1"},
            {"id": 2, "slug": "post-2", "title": "Post 2"},
        ]
        self.assertEqual(posts, expected)

    @patch("requests.get")
    def test_get_newsletter_post_metadata_pagination(self, mock_get):
        """Slugs from two successive responses are returned in order."""
        pages = [
            [{"id": 1, "slug": "post-1"}, {"id": 2, "slug": "post-2"}],
            [{"id": 3, "slug": "post-3"}, {"id": 4, "slug": "post-4"}],
        ]
        # Each call to requests.get consumes the next mocked response.
        mock_get.side_effect = [
            Mock(ok=True, json=Mock(return_value=page)) for page in pages
        ]

        slugs = get_newsletter_post_metadata(
            "test_subdomain", slugs_only=True, start_offset=0, end_offset=20
        )

        self.assertEqual(slugs, ["post-1", "post-2", "post-3", "post-4"])

    @patch("requests.get")
    def test_get_newsletter_post_metadata_no_posts(self, mock_get):
        """An empty response yields an empty result."""
        response = Mock(ok=True)
        response.json.return_value = []
        mock_get.return_value = response

        posts = get_newsletter_post_metadata("test_subdomain")

        self.assertEqual(posts, [])


class TestGetNewsletterRecommendations(unittest.TestCase):
    """Exercises `get_newsletter_recommendations` with HTTP and parsing mocked out."""

    @patch("requests.get")
    @patch.object(BeautifulSoup, "find_all")
    @patch.object(BeautifulSoup, "__init__", return_value=None)
    def test_get_newsletter_recommendations(
        self, mock_bs_init, mock_find_all, mock_get
    ):
        """Titles are paired with links and the query string is stripped."""
        mock_get.return_value = Mock(ok=True, text="mocked_html")

        # A plain dict stands in for the anchor tag: only ["href"] is read.
        anchor = {"href": "https://mocked_url.com?param=value"}
        mock_div = MagicMock()
        mock_div.find.return_value = anchor

        div_elements = [mock_div, mock_div]
        title_elements = [Mock(text="title1"), Mock(text="title2")]
        # First find_all call returns the cards, second returns the titles.
        mock_find_all.side_effect = [div_elements, title_elements]

        result = get_newsletter_recommendations("test_subdomain")

        expected = [
            {"title": "title1", "url": "https://mocked_url.com"},
            {"title": "title2", "url": "https://mocked_url.com"},
        ]
        self.assertEqual(result, expected)

        mock_get.assert_called_once_with(
            "https://test_subdomain.substack.com/recommendations", headers=HEADERS
        )
        mock_bs_init.assert_called_once_with("mocked_html", "html.parser")
        self.assertEqual(mock_find_all.call_count, 2)


class TestGetPostContents(unittest.TestCase):
    """Exercises `get_post_contents` with `requests.get` mocked out."""

    @patch("requests.get")
    def test_get_post_contents_html_only(self, mock_get):
        """`html_only=True` returns just the `body_html` string."""
        html = "<html><body>Test post</body></html>"
        response = Mock(ok=True)
        response.json.return_value = {"body_html": html}
        mock_get.return_value = response

        result = get_post_contents("test_subdomain", "test_slug", html_only=True)

        self.assertEqual(result, "<html><body>Test post</body></html>")

    @patch("requests.get")
    def test_get_post_contents_all_metadata(self, mock_get):
        """`html_only=False` returns the whole post dict."""
        response = Mock(ok=True)
        response.json.return_value = {
            "body_html": "<html><body>Test post</body></html>",
            "title": "Test post",
            "author": "Test author",
            "date": "2022-01-01",
        }
        mock_get.return_value = response

        result = get_post_contents("test_subdomain", "test_slug", html_only=False)

        expected = {
            "body_html": "<html><body>Test post</body></html>",
            "title": "Test post",
            "author": "Test author",
            "date": "2022-01-01",
        }
        self.assertEqual(result, expected)


# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Loading

0 comments on commit f31a77f

Please sign in to comment.