forked from Jess3Jane/mastodon-ebooks
-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
99 lines (80 loc) · 3.2 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from mastodon import Mastodon
from os import path
from bs4 import BeautifulSoup
import re
import sys
# Base URL of the Mastodon instance that every API call below is made against.
api_base_url = "https://botsin.space"
# OAuth scopes requested at app registration and again at user login.
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"]

# The first command-line argument is the directory that holds usercred.secret
# and receives corpus.txt. Fail with a usage message instead of an IndexError
# when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: python main.py <data-directory>")
data_dir = sys.argv[1]

usercred_secret = path.join(data_dir, "usercred.secret")
# Client (application) credentials are looked up relative to the current
# working directory, not inside the data directory.
clientcred_secret = "clientcred.secret"
corpus = path.join(data_dir, "corpus.txt")

# Register the application once; the resulting credentials are cached on disk.
if not path.exists(clientcred_secret):
    print("No clientcred.secret, registering application")
    Mastodon.create_app(
        "ebooks",
        api_base_url=api_base_url,
        to_file=clientcred_secret,
        scopes=scopes,
    )

# Log the user in once via the OAuth authorization-code flow (the user opens
# the printed URL and pastes the code back), rather than email/password.
if not path.exists(usercred_secret):
    print("No usercred.secret, logging in")
    client = Mastodon(client_id=clientcred_secret, api_base_url=api_base_url)
    print("Visit this url:")
    print(client.auth_request_url(scopes=scopes))
    client.log_in(code=input("Secret: "), to_file=usercred_secret, scopes=scopes)
def parse_toot(toot):
    """Convert one status into plain corpus text, or return ``None`` to skip it.

    A status is skipped when it has a non-empty ``spoiler_text``, when it is a
    reblog, or when its ``visibility`` is neither ``"public"`` nor
    ``"unlisted"``.

    Otherwise ``toot.content`` is parsed as HTML and flattened: mentions are
    removed, ``<br>`` and ``<p>`` become line breaks, hashtags keep their
    text, and every other link is replaced by its target URL.

    Returns:
        The stripped lines of the status joined with ``"\\0"``, or ``None``
        when the status is skipped.
    """
    if toot.spoiler_text != "":
        return None
    if toot.reblog is not None:
        return None
    if toot.visibility not in ("public", "unlisted"):
        return None

    soup = BeautifulSoup(toot.content, "html.parser")

    # Mentions are dropped entirely (wrapper and text) until the corpus
    # format is able to carry them.
    for mention in soup.select("span.h-card"):
        mention.decompose()

    # Make all line breaks actual line breaks.
    for lb in soup.select("br"):
        lb.replace_with("\n")

    # Make each paragraph its own line; get_text() alone would run them
    # together.
    for p in soup.select("p"):
        p.insert_after("\n")
        p.unwrap()

    # Keep hashtags in the text. This must run before the generic link pass
    # below, which would otherwise replace them with their URLs.
    for ht in soup.select("a.hashtag"):
        ht.unwrap()

    # Replace every remaining link with its target URL. An anchor without an
    # href keeps its visible text instead of raising KeyError.
    for link in soup.select("a"):
        href = link.get("href")
        if href is None:
            link.unwrap()
        else:
            link.replace_with(href)

    # TODO: return text, mentions and links separately once markovify is
    # patched to accept them.
    lines = soup.get_text().strip().split("\n")
    return "\0".join(line.strip() for line in lines)
def get_toots(client, id):
    """Yield the parsed text of every usable status of account ``id``.

    Statuses are fetched page by page, starting with
    ``client.account_statuses(id)`` and continuing with
    ``client.fetch_next`` until a page is ``None`` or empty. Each status goes
    through ``parse_toot``; statuses it rejects (``None``) are not yielded.

    The running page count is printed after every tenth page as a progress
    indicator.
    """
    pages_done = 0
    page = client.account_statuses(id)
    while page:
        for status in page:
            parsed = parse_toot(status)
            if parsed is None:
                continue
            yield parsed
        page = client.fetch_next(page)
        pages_done += 1
        if pages_done % 10 == 0:
            print(pages_done)
# Authenticated client built from the two credential files.
client = Mastodon(
    client_id=clientcred_secret,
    access_token=usercred_secret,
    api_base_url=api_base_url,
)
me = client.account_verify_credentials()

# account_following() returns a single page of accounts, so walk every page;
# otherwise accounts beyond the first page would never be scraped.
following = []
page = client.account_following(me.id)
while page:
    following.extend(page)
    page = client.fetch_next(page)

# Rebuild the corpus from scratch on every run: one parsed status per line.
# NOTE(review): the file is written with the platform default text encoding;
# confirm that it (and whatever reads corpus.txt) can represent emoji.
with open(corpus, "w") as fp:
    for account in following:
        print(account.username)
        for toot_text in get_toots(client, account.id):
            fp.write(toot_text + "\n")