Skip to content

Commit

Permalink
[FEATURE] Add option to download only image urls (#25)
Browse files Browse the repository at this point in the history
* feat: add `url_only` in `DownloadConfig`

* test: add url_only random test

* docs: update README
  • Loading branch information
CWHer authored Sep 25, 2024
1 parent ee39b7d commit b48f423
Show file tree
Hide file tree
Showing 12 changed files with 96 additions and 18 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,16 @@ The configurations locate at [`config.py`](./pixiv_utils/pixiv_crawler/config.py

- `download_config.with_tag: bool`: Whether to download image tags to `tags.json`. :warning:

- `download_config.url_only: bool`: Whether to download only the image URLs, without downloading the images themselves. The URLs are returned by `app.run()`. :warning:

```python
...
download_config.url_only = True

...
urls = app.run() # a set of image URLs
```

- `download_config.num_threads: int`: The number of threads for parallel download :warning:

- `download_config.thread_delay: float`: The delay for each thread to start.
Expand Down
12 changes: 11 additions & 1 deletion README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,17 @@ if __name__ == "__main__":

- `download_config.store_path: str`: 存储下载图像的路径 :warning:

- `download_config.with_tag: bool`: 是否将图片标签下载到 `tags.json` 中:
- `download_config.with_tag: bool`: 是否将图片标签下载到 `tags.json` 中 :warning:

- `download_config.url_only: bool`: 是否仅下载图片链接,不下载图片文件,结果由 `app.run()` 返回 :warning:

```python
...
download_config.url_only = True

...
urls = app.run() # a set of image URLs
```

- `download_config.num_threads: int`: 并行下载的线程数 :warning:

Expand Down
1 change: 1 addition & 0 deletions pixiv_utils/pixiv_crawler/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class DownloadConfig:
fail_delay: float = 1 # Waiting time (s) after failure
store_path: str = "images" # Image save path
with_tag: bool = True # Whether to download tags to a separate json file
url_only: bool = False # Only download artwork urls
num_threads: int = 12 # Number of parallel threads
thread_delay: float = 1 # Waiting time (s) after thread start

Expand Down
10 changes: 8 additions & 2 deletions pixiv_utils/pixiv_crawler/crawlers/bookmark_crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import concurrent.futures as futures
import functools
import time
from typing import Set
from typing import Set, Union

import requests
import tqdm
Expand Down Expand Up @@ -120,7 +120,13 @@ def collect(self, artworks_per_json: int = 48):
printInfo("===== Collect bookmark complete =====")
printInfo(f"Number of downloadable artworks: {len(self.collector.id_group)}")

def run(self):
def run(self) -> Union[Set[str], float]:
"""
Run the bookmark crawler
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
self._requestCount()
self.collect()
self.collector.collect()
Expand Down
10 changes: 8 additions & 2 deletions pixiv_utils/pixiv_crawler/crawlers/keyword_crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import concurrent.futures as futures
import functools
import urllib.parse as urlparse
from typing import Set
from typing import Set, Union

import tqdm

Expand Down Expand Up @@ -91,7 +91,13 @@ def collect(self, artworks_per_json: int = 60):
printInfo(f"===== Collect {self.keyword} complete =====")
printInfo(f"Number of downloadable artworks: {len(self.collector.id_group)}")

def run(self):
def run(self) -> Union[Set[str], float]:
"""
Run the keyword crawler
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
self.collect()
self.collector.collect()
return self.downloader.download()
10 changes: 8 additions & 2 deletions pixiv_utils/pixiv_crawler/crawlers/ranking_crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import concurrent.futures as futures
import datetime
import re
from typing import Set
from typing import Set, Union

import tqdm

Expand Down Expand Up @@ -92,7 +92,13 @@ def addDate(current: datetime.date, days):

printInfo(f"===== Collect {content} ranking complete =====")

def run(self) -> float:
def run(self) -> Union[Set[str], float]:
"""
Run the ranking crawler
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
self._collect()
self.collector.collect()
return self.downloader.download()
10 changes: 9 additions & 1 deletion pixiv_utils/pixiv_crawler/crawlers/users_crawler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Set, Union

from pixiv_utils.pixiv_crawler.collector import Collector, collect, selectUser
from pixiv_utils.pixiv_crawler.config import user_config
from pixiv_utils.pixiv_crawler.downloader import Downloader
Expand Down Expand Up @@ -34,7 +36,13 @@ def collect(self):
self.collector.add(image_ids)
printInfo(f"===== Collect user {self.artist_id} complete =====")

def run(self):
def run(self) -> Union[Set[str], float]:
"""
Run the user crawler
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
self.collect()
self.collector.collect()
return self.downloader.download()
11 changes: 9 additions & 2 deletions pixiv_utils/pixiv_crawler/downloader/downloader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import concurrent.futures as futures
from typing import Iterable, Set
from typing import Iterable, Set, Union

import tqdm

Expand Down Expand Up @@ -29,7 +29,14 @@ def add(self, urls: Iterable[str]):
for url in urls:
self.url_group.add(url)

def download(self):
def download(self) -> Union[Set[str], float]:
"""
Returns:
Union[Set[str], float]: artwork urls or download traffic usage
"""
if download_config.url_only:
return self.url_group

download_traffic = 0.0
printInfo("===== Downloader start =====")

Expand Down
10 changes: 8 additions & 2 deletions tests/test_pixiv_crawler/test_bookmark_crawler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import random
import shutil
import unittest

Expand Down Expand Up @@ -34,13 +35,18 @@ def test_run(self):
user_config.cookie = cookie
user_config.user_id = uid
download_config.with_tag = False
download_config.url_only = random.choice([True, False])

checkDir(download_config.store_path)
app = BookmarkCrawler(n_images=5, capacity=10)
app.run()
result = app.run()

self.assertGreater(len(app.downloader.url_group), 20)
self.assertGreater(len(os.listdir(download_config.store_path)), 5)
if download_config.url_only:
self.assertEqual(result, app.downloader.url_group)
self.assertEqual(len(os.listdir(download_config.store_path)), 0)
else:
self.assertGreater(len(os.listdir(download_config.store_path)), 5)


if __name__ == "__main__":
Expand Down
10 changes: 8 additions & 2 deletions tests/test_pixiv_crawler/test_keyword_crawler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import random
import shutil
import unittest

Expand Down Expand Up @@ -30,6 +31,7 @@ def test_run(self):
user_config.user_id = ""
user_config.cookie = ""
download_config.with_tag = True
download_config.url_only = random.choice([True, False])

checkDir(download_config.store_path)
app = KeywordCrawler(
Expand All @@ -39,11 +41,15 @@ def test_run(self):
n_images=5,
capacity=10,
)
app.run()
result = app.run()

self.assertGreater(len(app.downloader.url_group), 20)
self.assertTrue("tags.json" in os.listdir(download_config.store_path))
self.assertGreater(len(os.listdir(download_config.store_path)), 5)
if download_config.url_only:
self.assertEqual(result, app.downloader.url_group)
self.assertEqual(len(os.listdir(download_config.store_path)), 1)
else:
self.assertGreater(len(os.listdir(download_config.store_path)), 5)


if __name__ == "__main__":
Expand Down
10 changes: 8 additions & 2 deletions tests/test_pixiv_crawler/test_ranking_crawler.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import os
import random
import shutil
import unittest

Expand Down Expand Up @@ -32,6 +33,7 @@ def test_run(self):
user_config.user_id = ""
user_config.cookie = ""
download_config.with_tag = False
download_config.url_only = random.choice([True, False])
ranking_config.start_date = datetime.date(2024, 5, 1)
ranking_config.range = 2
ranking_config.mode = "weekly"
Expand All @@ -40,10 +42,14 @@ def test_run(self):

checkDir(download_config.store_path)
app = RankingCrawler(capacity=10)
app.run()
result = app.run()

self.assertGreater(len(app.downloader.url_group), 50)
self.assertGreater(len(os.listdir(download_config.store_path)), 5)
if download_config.url_only:
self.assertEqual(result, app.downloader.url_group)
self.assertEqual(len(os.listdir(download_config.store_path)), 0)
else:
self.assertGreater(len(os.listdir(download_config.store_path)), 5)


if __name__ == "__main__":
Expand Down
10 changes: 8 additions & 2 deletions tests/test_pixiv_crawler/test_user_crawler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import random
import shutil
import unittest

Expand Down Expand Up @@ -30,14 +31,19 @@ def test_run(self):
user_config.user_id = ""
user_config.cookie = ""
download_config.with_tag = False
download_config.url_only = random.choice([True, False])
download_config.num_threads = 10

checkDir(download_config.store_path)
app = UserCrawler(artist_id="32548944", capacity=10)
app.run()
result = app.run()

self.assertGreater(len(app.downloader.url_group), 200)
self.assertGreater(len(os.listdir(download_config.store_path)), 5)
if download_config.url_only:
self.assertEqual(result, app.downloader.url_group)
self.assertEqual(len(os.listdir(download_config.store_path)), 0)
else:
self.assertGreater(len(os.listdir(download_config.store_path)), 5)


if __name__ == "__main__":
Expand Down

0 comments on commit b48f423

Please sign in to comment.