#!/usr/bin/env python3
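"""A4A: download all files from 4chan(nel) threads.

Example invocations (hypothetical thread URL and file names):
    ./inb4404.py https://boards.4channel.org/wg/thread/1234567
    ./inb4404.py -l threads.txt -a archive.txt
"""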
import argparse
import asyncio
from base64 import b64decode
import fnmatch
import json
import os
import sys
import time
from textwrap import dedent
import urllib.error
from urllib.request import Request, urlopen
import aiohttp
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Classes
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class DefaultOptions:
"""Store defaults for command line options."""
def __init__(self):
script_path = os.path.dirname(os.path.realpath(__file__))
# List of files to scan for thread links
self.LIST = []
# Verbosity of the terminal output
# <0 -> really quiet mode (no output at all)
# 0 -> quiet mode (errors/warnings only)
# 1 -> default mode (0 + basic progress information)
self.VERBOSITY = 1
# Base directory
self.PATH = os.path.join(script_path, "downloads")
# Whether to use the original filenames or UNIX timestamps
# True -> original filenames
# False -> UNIX timestamps
self.USE_NAMES = False
# Path to an archive file (holds MD5 hashes of downloaded files)
self.ARCHIVE = None
# How many connections to use with aiohttp's ClientSession
self.CONNECTIONS = 10
# How often to retry a thread (!) if errors occur
# N>0 -> retry N times
# N=0 -> disable
# N<0 -> retry indefinitely (not recommended)
self.RETRIES = 5
class CustomArgumentParser(argparse.ArgumentParser):
"""Override ArgumentParser's automatic help text."""
def format_help(self):
"""Return custom help text."""
help_text = dedent(f"""\
A4A is a Python script to download all files from 4chan(nel) threads.
Usage: {self.prog} [OPTIONS] THREAD [THREAD]...
{self.prog} [OPTIONS] -l LIST [-l LIST]...
Thread:
4chan(nel) thread URL
Options:
-h, --help show help
-l, --list LIST read thread links from file
-q, --quiet suppress non-error output
-p, --path PATH set output directory (def: {self.get_default("base_dir")})
-f, --filenames use original filenames instead of UNIX timestamps
-a, --archive FILE keep track of downloaded files by logging MD5 hashes
--connections N number of connections to use (def: {self.get_default("connections")})
--retries N how often to retry a thread if errors occur (def: {self.get_default("retries")})
N<0 to retry indefinitely (not recommended)
""")
return help_text
class DownloadableThread:
"""Store thread-related information and handle its processing."""
def __init__(self, position, link):
"""Initialize thread object."""
self.count = 0
self.files = []
self.pos = position
self.link = link.split("#")[0]
info = link.partition(".org/")[2]
        # info has the form <board>/thread/<id> or <board>/thread/<id>/<dir name>
if len(info.split("/")) > 3:
self.board, _, self.id, self.dir = info.split("/")
else:
self.board, _, self.id = info.split("/")
self.dir = self.id
resp_json = self.get_json()
if not resp_json:
return
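        # Fields used from each post dict (assumed 4chan API semantics):
        #   'tim':      server-assigned name, the upload's UNIX timestamp
        #   'ext':      file extension including the dot, e.g. ".jpg"
        #   'filename': original filename without the extension
        #   'md5':      base64-encoded MD5 hash of the file
        # Posts without 'tim' carry no file and are skipped.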
self.files = [
{
'link': f"https://i.4cdn.org/{self.board}/{p['tim']}{p['ext']}",
'name': f"{p['filename'] if opts.names else p['tim']}{p['ext']}",
'md5': b64decode(p['md5']).hex(),
} for p in resp_json['posts'] if 'tim' in p
]
def resolve_path(self):
"""Assemble final output path and change the working directory."""
        # The output layout is fixed: <base_dir>/<board>/<dir>
out_dir = os.path.join(opts.base_dir, self.board, self.dir)
        os.makedirs(out_dir, exist_ok=True)
os.chdir(out_dir)
def get_json(self):
"""Contact 4chan's API to get the names of all files in a thread."""
api_call = f"https://a.4cdn.org/{self.board}/thread/{self.id}.json"
        # A custom User-Agent is necessary to avoid 403 errors on 4chan.org;
        # 4channel.org works fine without it
req = Request(api_call, headers={'User-Agent': '4chan Archiver'})
resp_json = None
for _ in range(2):
try:
with urlopen(req) as resp:
resp_json = resp.read()
resp_json = json.loads(resp_json)
break
except urllib.error.HTTPError:
time.sleep(5)
continue
except urllib.error.URLError:
if self.pos == 1:
err("Couldn't establish connection!")
else:
err("Lost connection!")
sys.exit(1)
return resp_json
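    # The thread JSON is assumed to have the form {"posts": [{...}, ...]};
    # only the per-post file fields listed in __init__ are used.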
def fetch_progress(self):
"""Return thread-wise and file-wise progress."""
threads = len(opts.thread)
files = len(self.files)
t_width = len(str(threads))
f_width = len(str(files))
t_progress = f"[{self.pos: >{t_width}}/{threads}]"
f_progress = f"[{self.count: >{f_width}}/{files}]"
if self.count:
progress = f"{t_progress} {f_progress}"
else:
progress = t_progress
return progress
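        # e.g. "[ 2/10] [ 3/48]" for file 3 of 48 in thread 2 of 10; only
        # the thread part is shown until the thread's first file is done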
async def get_file(self, link, name, md5, session):
"""Download a single file."""
if os.path.exists(name) or md5 in opts.archived_md5:
self.count += 1
return
async with session.get(link) as media:
# Open file initially with .part suffix
with open(f"{name}.part", "wb") as f:
while True:
chunk = await media.content.read(1024)
if not chunk:
break
f.write(chunk)
        # Remove the .part suffix once the download is complete; from
        # this point on clean() no longer deletes the file if the
        # script gets interrupted
os.rename(f"{name}.part", name)
if opts.archive:
log_hash(md5)
self.count += 1
msg(f"{self.fetch_progress()} {self.board}/{self.dir}/{name}")
async def download(self):
"""Download a thread."""
if not self.files:
# In this case the progress line gets printed to stderr
err(f"{self.fetch_progress()} {self.link}")
err(f"Thread 404'd!")
return
msg(f"{self.fetch_progress()} {self.link}")
# Retries imply attempts after the first try failed
# So the max. number of attempts is opts.retries+1
attempt = 0
while attempt <= opts.retries or opts.retries < 0:
if attempt > 0:
err(f"Retrying... ({attempt} out of "
f"{opts.retries if opts.retries > 0 else 'Inf'} attempts)")
time.sleep(5)
try:
tout = aiohttp.ClientTimeout(total=None)
conn = aiohttp.TCPConnector(limit=opts.connections)
async with aiohttp.ClientSession(timeout=tout, connector=conn) as session:
tasks = [self.get_file(f['link'], f['name'], f['md5'], session)
for f in self.files]
await asyncio.gather(*tasks)
# Leave attempt loop early if all files were downloaded successfully
break
except aiohttp.ClientConnectionError:
err("Lost connection!")
self.count = 0
attempt += 1
except aiohttp.ClientPayloadError:
err("Malformed or missing chunk!")
self.count = 0
attempt += 1
finally:
clean()
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Functions
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def err(*args, level=0, **kwargs):
"""Print to stderr."""
if level <= opts.verbosity:
print(f"[{time.strftime('%X')}]", *args, file=sys.stderr, **kwargs)
def msg(*args, level=1, **kwargs):
"""Print to stdout."""
if level <= opts.verbosity:
print(f"[{time.strftime('%X')}]", *args, **kwargs)
def positive_int(string):
"""Convert string provided by argparse to a positive int."""
try:
value = int(string)
if value <= 0:
raise ValueError
except ValueError:
error = f"invalid positive int value: {string}"
raise argparse.ArgumentTypeError(error)
return value
def valid_list(string):
"""Convert string provided by argparse to list path."""
path = os.path.abspath(string)
try:
with open(path, "r") as f:
_ = f.read(1)
except FileNotFoundError:
raise argparse.ArgumentTypeError(f"{path} does not exist!")
except (OSError, UnicodeError):
raise argparse.ArgumentTypeError(f"{path} is not a valid text file!")
return path
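# A list file is plain text with one thread URL per line; lines starting
# with "#" are skipped as comments when the file is read in parse_cli().
# Hypothetical example:
#   # wallpaper threads
#   https://boards.4channel.org/wg/thread/1234567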
def valid_archive(string):
"""Convert string provided by argparse to an archive path."""
path = os.path.abspath(string)
try:
with open(path, "r") as f:
_ = f.read(1)
except FileNotFoundError:
pass
except (OSError, UnicodeError):
raise argparse.ArgumentTypeError(f"{path} is not a valid archive!")
return path
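# An archive file holds one hex-encoded MD5 per line; log_hash() appends
# a hash after each successful download and reload_archive() re-reads the
# file before every thread. Illustrative content:
#   d41d8cd98f00b204e9800998ecf8427e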
def parse_cli():
"""Parse the command line arguments with argparse."""
defaults = DefaultOptions()
parser = CustomArgumentParser(usage="%(prog)s [OPTIONS] THREAD [THREAD]...")
parser.add_argument("thread", nargs="*", help="thread URL")
parser.add_argument(
"-l", "--list", action="append", type=valid_list, default=defaults.LIST
)
parser.add_argument(
"-q", "--quiet", dest="verbosity", action="store_const",
const=0, default=defaults.VERBOSITY
)
parser.add_argument("-p", "--path", dest="base_dir", default=defaults.PATH)
parser.add_argument(
"-f", "--filenames", dest="names", action="store_true",
default=defaults.USE_NAMES
)
parser.add_argument(
"-a", "--archive", dest="archive", type=valid_archive,
default=defaults.ARCHIVE
)
parser.add_argument(
"--connections", type=positive_int, default=defaults.CONNECTIONS
)
parser.add_argument("--retries", type=int, default=defaults.RETRIES)
args = parser.parse_args()
# Scan lists for thread links
    for list_path in args.list:
        with open(list_path, "r") as f:
            args.thread.extend([t.strip() for t in f if not t.startswith("#")])
# Make sure base_dir is an absolute path
args.base_dir = os.path.abspath(args.base_dir)
    # Weed out clearly wrong thread URLs and drop duplicates
    # (the pattern matches both boards.4chan.org and boards.4channel.org)
args.thread = set(fnmatch.filter(args.thread, "*boards.4chan*.org/*/thread/*"))
return args
def reload_archive():
"""Re-read archive for each new thread."""
if not (opts.archive and os.path.exists(opts.archive)):
content = set()
else:
with open(opts.archive, "r") as f:
            content = {line.strip() for line in f}
return content
def log_hash(md5):
"""Log file's hash in the archive."""
with open(opts.archive, "a") as f:
print(md5, file=f)
def clean():
"""Clean output directory of any partially downloaded (.part) files."""
for f in [f for f in os.listdir() if f.endswith(".part")]:
os.remove(f)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main Function
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def main():
"""Run the main function body."""
for i, url in enumerate(opts.thread, start=1):
opts.archived_md5 = reload_archive()
thread = DownloadableThread(i, url)
thread.resolve_path()
        asyncio.run(thread.download())
if __name__ == '__main__':
opts = parse_cli()
if not opts.thread:
err("No thread links specified!")
try:
main()
except KeyboardInterrupt:
err("User interrupt!")