diff --git a/party/cli.py b/party/cli.py index a0ab9ba..82944f8 100755 --- a/party/cli.py +++ b/party/cli.py @@ -92,6 +92,10 @@ "exclusive with post_id, post_title and file_format" ) +size_limit_option = typer.Option( + help="Allows for a size limit, in Megabytes, as a cut off for downloaded " + "files. Example: if 50, no files larger than 50Mb will be downloaded." +) file_format_option = typer.Option( help="Used to set the output file format. " "Mutually exclusive with post_id, post_title and ordered short. " @@ -117,6 +121,7 @@ def pull_user( post_title: Annotated[bool, post_title_option] = False, ordered_short: Annotated[bool, ordered_short_option] = False, file_format: Annotated[str, file_format_option] = "{ref.filename}", + size_limit: Annotated[int, size_limit_option] = -1, sluglify: bool = False, full_check: bool = False, ): @@ -160,6 +165,7 @@ def pull_user( ordered_short=ordered_short, file_format=file_format, sluglify=sluglify, + size_limit=size_limit, ) update_csluglify(sluglify) @@ -209,7 +215,8 @@ def pull_user( typer.secho(f"Downloading from user: {user.name}", fg=typer.colors.MAGENTA) with tqdm(total=len(files)) as pbar: output = asyncio.run( - download_async(pbar, site, directory, files, workers, full_check) + download_async(pbar, site, directory, files, workers, full_check, + size_limit) ) write_etags(directory) count = Counter(output) @@ -223,6 +230,7 @@ async def download_async( files, workers: int = 10, full_check: bool = False, + size_limit: int = -1, ): """Basic AsyncIO implementation of downloads for files""" timeout = aiohttp.ClientTimeout(60 * 60, sock_connect=30) @@ -250,7 +258,7 @@ async def download(file, semaphore): filename = f"{directory}/{file.filename}" async with semaphore: status = await file.download( - session, filename, 0, full_check + session, filename, 0, full_check, size_limit ) if status == StatusEnum.ERROR_429 and workers > 1: workers = workers - 1 diff --git a/party/common.py b/party/common.py index 5a2986c..af324c7 100644 --- a/party/common.py +++ b/party/common.py @@ -56,6 +56,7 @@ class StatusEnum(Enum): EXISTS = 5 ERROR_OSERROR = 6 DUPLICATE = 7 + TOO_LARGE = 8 def generate_token(size=16): diff --git a/party/posts.py b/party/posts.py index 6c96d98..1060816 100644 --- a/party/posts.py +++ b/party/posts.py @@ -14,7 +14,11 @@ import aiohttp import desert -from aiohttp import ClientPayloadError, ServerTimeoutError, ClientConnectorError +from aiohttp import ( + ClientPayloadError, + ServerTimeoutError, + ClientConnectorError, +) from dateutil.parser import parse from loguru import logger from tqdm import tqdm @@ -122,6 +126,7 @@ async def download( filename: str = ".", retries: int = 0, full_check: bool = False, + cut_off: int = -1, ): """Async download handler""" status = StatusEnum.SUCCESS @@ -138,6 +143,8 @@ async def download( } try: async with session.head(url, allow_redirects=True) as head: + size_in_mb = (int(head.headers["content-length"])/1024/1024) \ + if 'content-length' in head.headers else 1 if head.status == 429: return StatusEnum.ERROR_429 try: @@ -149,6 +156,12 @@ async def download( return StatusEnum.ERROR_OTHER if etag_exists(tag) and not os.path.exists(filename): return StatusEnum.DUPLICATE + if ( + cut_off > 0 + and "content-length" in head.headers + and cut_off < size_in_mb + ): + return StatusEnum.TOO_LARGE add_etag(tag) async with session.get(url, headers=headers) as resp: @@ -178,7 +191,11 @@ async def download( ) fbar.refresh() fbar.close() - except (ClientPayloadError, ServerTimeoutError, ClientConnectorError) as err: + except ( + ClientPayloadError, + ServerTimeoutError, + ClientConnectorError, + ) as err: logger.debug( { "error": err, @@ -226,17 +243,19 @@ async def download( {"error": err, "filename": filename, "url": self.path} ) status = StatusEnum.ERROR_OTHER - except (ConnectTimeoutError, ServerTimeoutError, ClientConnectorError) as err: + except ( + ConnectTimeoutError, + ServerTimeoutError, + ClientConnectorError, + ) as err: logger.debug( {"error": err, "filename": filename, "url": self.path} ) if retries < 2: - status = await self.download( - session, filename, retries + 1 - ) + status = await self.download(session, filename, retries + 1) else: status = StatusEnum.ERROR_TIMEOUT - if 'tag' in locals(): + if "tag" in locals(): remove_etag(tag) return status