Skip to content

Commit

Permalink
[FEATURE] Add option to download only image urls (#25)
Browse files Browse the repository at this point in the history
* feat: add `url_only` in `DownloadConfig`

* test: add url_only random test

* docs: update README
  • Loading branch information
CWHer authored Sep 25, 2024
1 parent ee39b7d commit b48f423
Show file tree
Hide file tree
Showing 12 changed files with 96 additions and 18 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,16 @@ The configurations locate at [`config.py`](./pixiv_utils/pixiv_crawler/config.py

- `download_config.with_tag: bool`: Whether to download image tags to `tags.json`. :warning:

- `download_config.url_only: bool`: Whether to download only the image URLs, without downloading the images themselves. The URLs are returned by `app.run()`. :warning:

```python
...
download_config.url_only = True

...
urls = app.run() # a set of image URLs
```

- `download_config.num_threads: int`: The number of threads for parallel download :warning:

- `download_config.thread_delay: float`: The delay for each thread to start.
Expand Down
12 changes: 11 additions & 1 deletion README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,17 @@ if __name__ == "__main__":

- `download_config.store_path: str`: 存储下载图像的路径 :warning:

- `download_config.with_tag: bool`: 是否将图片标签下载到 `tags.json` 中:
- `download_config.with_tag: bool`: 是否将图片标签下载到 `tags.json` 中 :warning:

- `download_config.url_only: bool`: 是否仅下载图片链接,不下载图片文件,结果由 `app.run()` 返回 :warning:

```python
...
download_config.url_only = True

...
urls = app.run() # a set of image URLs
```

- `download_config.num_threads: int`: 并行下载的线程数 :warning:

Expand Down
1 change: 1 addition & 0 deletions pixiv_utils/pixiv_crawler/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class DownloadConfig:
fail_delay: float = 1 # Waiting time (s) after failure
store_path: str = "images" # Image save path
with_tag: bool = True # Whether to download tags to a separate json file
url_only: bool = False # Only download artwork urls
num_threads: int = 12 # Number of parallel threads
thread_delay: float = 1 # Waiting time (s) after thread start

Expand Down
10 changes: 8 additions & 2 deletions pixiv_utils/pixiv_crawler/crawlers/bookmark_crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import concurrent.futures as futures
import functools
import time
from typing import Set
from typing import Set, Union

import requests
import tqdm
Expand Down Expand Up @@ -120,7 +120,13 @@ def collect(self, artworks_per_json: int = 48):
printInfo("===== Collect bookmark complete =====")
printInfo(f"Number of downloadable artworks: {len(self.collector.id_group)}")

def run(self):
def run(self) -> Union[Set[str], float]:
"""
Run the bookmark crawler
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
self._requestCount()
self.collect()
self.collector.collect()
Expand Down
10 changes: 8 additions & 2 deletions pixiv_utils/pixiv_crawler/crawlers/keyword_crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import concurrent.futures as futures
import functools
import urllib.parse as urlparse
from typing import Set
from typing import Set, Union

import tqdm

Expand Down Expand Up @@ -91,7 +91,13 @@ def collect(self, artworks_per_json: int = 60):
printInfo(f"===== Collect {self.keyword} complete =====")
printInfo(f"Number of downloadable artworks: {len(self.collector.id_group)}")

def run(self):
def run(self) -> Union[Set[str], float]:
"""
Run the keyword crawler
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
self.collect()
self.collector.collect()
return self.downloader.download()
10 changes: 8 additions & 2 deletions pixiv_utils/pixiv_crawler/crawlers/ranking_crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import concurrent.futures as futures
import datetime
import re
from typing import Set
from typing import Set, Union

import tqdm

Expand Down Expand Up @@ -92,7 +92,13 @@ def addDate(current: datetime.date, days):

printInfo(f"===== Collect {content} ranking complete =====")

def run(self) -> float:
def run(self) -> Union[Set[str], float]:
"""
Run the ranking crawler
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
self._collect()
self.collector.collect()
return self.downloader.download()
10 changes: 9 additions & 1 deletion pixiv_utils/pixiv_crawler/crawlers/users_crawler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Set, Union

from pixiv_utils.pixiv_crawler.collector import Collector, collect, selectUser
from pixiv_utils.pixiv_crawler.config import user_config
from pixiv_utils.pixiv_crawler.downloader import Downloader
Expand Down Expand Up @@ -34,7 +36,13 @@ def collect(self):
self.collector.add(image_ids)
printInfo(f"===== Collect user {self.artist_id} complete =====")

def run(self):
def run(self) -> Union[Set[str], float]:
"""
Run the user crawler
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
self.collect()
self.collector.collect()
return self.downloader.download()
11 changes: 9 additions & 2 deletions pixiv_utils/pixiv_crawler/downloader/downloader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import concurrent.futures as futures
from typing import Iterable, Set
from typing import Iterable, Set, Union

import tqdm

Expand Down Expand Up @@ -29,7 +29,14 @@ def add(self, urls: Iterable[str]):
for url in urls:
self.url_group.add(url)

def download(self):
def download(self) -> Union[Set[str], float]:
"""
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
if download_config.url_only:
return self.url_group

download_traffic = 0.0
printInfo("===== Downloader start =====")

Expand Down
10 changes: 8 additions & 2 deletions tests/test_pixiv_crawler/test_bookmark_crawler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import random
import shutil
import unittest

Expand Down Expand Up @@ -34,13 +35,18 @@ def test_run(self):
user_config.cookie = cookie
user_config.user_id = uid
download_config.with_tag = False
download_config.url_only = random.choice([True, False])

checkDir(download_config.store_path)
app = BookmarkCrawler(n_images=5, capacity=10)
app.run()
result = app.run()

self.assertGreater(len(app.downloader.url_group), 20)
self.assertGreater(len(os.listdir(download_config.store_path)), 5)
if download_config.url_only:
self.assertEqual(result, app.downloader.url_group)
self.assertEqual(len(os.listdir(download_config.store_path)), 0)
else:
self.assertGreater(len(os.listdir(download_config.store_path)), 5)


if __name__ == "__main__":
Expand Down
10 changes: 8 additions & 2 deletions tests/test_pixiv_crawler/test_keyword_crawler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import random
import shutil
import unittest

Expand Down Expand Up @@ -30,6 +31,7 @@ def test_run(self):
user_config.user_id = ""
user_config.cookie = ""
download_config.with_tag = True
download_config.url_only = random.choice([True, False])

checkDir(download_config.store_path)
app = KeywordCrawler(
Expand All @@ -39,11 +41,15 @@ def test_run(self):
n_images=5,
capacity=10,
)
app.run()
result = app.run()

self.assertGreater(len(app.downloader.url_group), 20)
self.assertTrue("tags.json" in os.listdir(download_config.store_path))
self.assertGreater(len(os.listdir(download_config.store_path)), 5)
if download_config.url_only:
self.assertEqual(result, app.downloader.url_group)
self.assertEqual(len(os.listdir(download_config.store_path)), 1)
else:
self.assertGreater(len(os.listdir(download_config.store_path)), 5)


if __name__ == "__main__":
Expand Down
10 changes: 8 additions & 2 deletions tests/test_pixiv_crawler/test_ranking_crawler.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import os
import random
import shutil
import unittest

Expand Down Expand Up @@ -32,6 +33,7 @@ def test_run(self):
user_config.user_id = ""
user_config.cookie = ""
download_config.with_tag = False
download_config.url_only = random.choice([True, False])
ranking_config.start_date = datetime.date(2024, 5, 1)
ranking_config.range = 2
ranking_config.mode = "weekly"
Expand All @@ -40,10 +42,14 @@ def test_run(self):

checkDir(download_config.store_path)
app = RankingCrawler(capacity=10)
app.run()
result = app.run()

self.assertGreater(len(app.downloader.url_group), 50)
self.assertGreater(len(os.listdir(download_config.store_path)), 5)
if download_config.url_only:
self.assertEqual(result, app.downloader.url_group)
self.assertEqual(len(os.listdir(download_config.store_path)), 0)
else:
self.assertGreater(len(os.listdir(download_config.store_path)), 5)


if __name__ == "__main__":
Expand Down
10 changes: 8 additions & 2 deletions tests/test_pixiv_crawler/test_user_crawler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import random
import shutil
import unittest

Expand Down Expand Up @@ -30,14 +31,19 @@ def test_run(self):
user_config.user_id = ""
user_config.cookie = ""
download_config.with_tag = False
download_config.url_only = random.choice([True, False])
download_config.num_threads = 10

checkDir(download_config.store_path)
app = UserCrawler(artist_id="32548944", capacity=10)
app.run()
result = app.run()

self.assertGreater(len(app.downloader.url_group), 200)
self.assertGreater(len(os.listdir(download_config.store_path)), 5)
if download_config.url_only:
self.assertEqual(result, app.downloader.url_group)
self.assertEqual(len(os.listdir(download_config.store_path)), 0)
else:
self.assertGreater(len(os.listdir(download_config.store_path)), 5)


if __name__ == "__main__":
Expand Down

0 comments on commit b48f423

Please sign in to comment.