From 758edc9292650f036459b4d78bc18369cf4f14a2 Mon Sep 17 00:00:00 2001 From: enduser420 Date: Mon, 27 Jun 2022 17:46:28 +0530 Subject: [PATCH 001/252] [2chen] Add 2chen.moe extractor --- gallery_dl/extractor/2chen.py | 90 ++++++++++++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 1 + 2 files changed, 91 insertions(+) create mode 100644 gallery_dl/extractor/2chen.py diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py new file mode 100644 index 0000000000..27c5048248 --- /dev/null +++ b/gallery_dl/extractor/2chen.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from .common import Extractor, Message +from .. import text + + +class _2chenThreadExtractor(Extractor): + """Extractor for 2chen threads""" + category = "2chen" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{filename}" + pattern = (r"(?:https?://)?2chen\.moe" + r"/([^/]+)/(\d+)") + test = ("https://2chen.moe/jp/303786",) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "https://2chen.moe/{}/{}".format(self.board, self.thread) + page = self.request(url, encoding="utf-8").text + data = self.metadata(page) + yield Message.Directory, data + for post in self.posts(page): + if post["url"] == None or post["filename"] == None: + continue + url = "https://2chen.moe{}".format(post["url"]) + yield Message.Url, url, post + + def metadata(self, page): + title = text.extract(page, "
<h3>", "</h3>
")[0] + return { + "board": self.board, + "thread": self.thread, + "title": title + } + + def posts(self, page): + posts = text.extract_iter( + page, '
<figure>', '</figure>
') + return [self.parse(post) for post in posts] + + def parse(self, post): + data = self._extract_post(post) + data["extension"] = str(data["filename"]).split(".")[-1] + return data + + @staticmethod + def _extract_post(post): + return text.extract_all(post, ( + ('url', '') diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index e273f8433f..1b6d4ec651 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -10,6 +10,7 @@ modules = [ "2chan", + "2chen", "35photo", "3dbooru", "420chan", From f1d15e2aa3d9ec971746aeaa1474ad6b16a253f7 Mon Sep 17 00:00:00 2001 From: enduser420 Date: Mon, 27 Jun 2022 17:58:26 +0530 Subject: [PATCH 002/252] change "==" to is --- gallery_dl/extractor/2chen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py index 27c5048248..4b5c99de64 100644 --- a/gallery_dl/extractor/2chen.py +++ b/gallery_dl/extractor/2chen.py @@ -30,7 +30,7 @@ def items(self): data = self.metadata(page) yield Message.Directory, data for post in self.posts(page): - if post["url"] == None or post["filename"] == None: + if post["url"] is None or post["filename"] is None: continue url = "https://2chen.moe{}".format(post["url"]) yield Message.Url, url, post From 853435f4379c0b516c26c8715603f818da995da1 Mon Sep 17 00:00:00 2001 From: enduser420 Date: Mon, 27 Jun 2022 18:29:11 +0530 Subject: [PATCH 003/252] fix for "test_unique_pattern_matches" --- gallery_dl/extractor/2chen.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py index 4b5c99de64..f7cdb93a72 100644 --- a/gallery_dl/extractor/2chen.py +++ b/gallery_dl/extractor/2chen.py @@ -65,18 +65,21 @@ class _2chenBoardExtractor(Extractor): """Extractor for 2chen boards""" category = "2chen" subcategory = "board" - pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)(?:catalog)?" 
+ pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/?(?:catalog)?$" test = ( ("https://2chen.moe/co/", { "pattern": _2chenThreadExtractor.pattern }), + ("https://2chen.moe/co", { + "pattern": _2chenThreadExtractor.pattern + }), ("https://2chen.moe/co/catalog", { "pattern": _2chenThreadExtractor.pattern })) def __init__(self, match): Extractor.__init__(self, match) - self.board = match.group(1) + self.board = match.group() def items(self): url = "https://2chen.moe/{}/catalog".format(self.board) From a9b8a2430d29eccdc92da587e7051567d5178eeb Mon Sep 17 00:00:00 2001 From: enduser420 Date: Thu, 30 Jun 2022 19:57:44 +0530 Subject: [PATCH 004/252] [Jpgchurch] Add Jpgchurch extractor --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/jpgchurch.py | 121 ++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 gallery_dl/extractor/jpgchurch.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 1b6d4ec651..d63c79344f 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -66,6 +66,7 @@ "instagram", "issuu", "itaku", + "jpgchurch", "kabeuchi", "keenspot", "kemonoparty", diff --git a/gallery_dl/extractor/jpgchurch.py b/gallery_dl/extractor/jpgchurch.py new file mode 100644 index 0000000000..e0105f26d5 --- /dev/null +++ b/gallery_dl/extractor/jpgchurch.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from .common import Extractor, Message +from .. import text + +BASE_PATTERN = r"(?:https?://)?jpg\.church" + + +class JpgchurchImageExtractor(Extractor): + """Base Extractor for Jpgchurch Images""" + category = "Jpgchurch" + subcategory = "image" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{filename}" + pattern = BASE_PATTERN + r"/img/([\w\d\-\.]+)" + root = "https://jpg.church" + test = ("https://jpg.church/img/funnymeme.LecXGS",) + + def __init__(self, match): + Extractor.__init__(self, match) + self.image = match.group(1) + + def items(self): + data = self.metadata() + for image in self.images(): + if "album" in image or "user" in image: + data.update(image) + yield Message.Directory, data + yield Message.Url, image["url"], image + + def metadata(self): + """Return general metadata""" + return {} + + def images(self): + """Return an iterable containing the image(s)""" + url = "{}/img/{}".format(self.root, self.image) + return [self._get_images(url)] + + def _get_images(self, url): + page = self.request(url).text + data = self._extract_image(page) + data.update({ + "user": data["user"].split("/")[-1], + "extension": text.ext_from_url(data["url"]) + }) + return data + + @staticmethod + def _extract_image(page): + _page = text.extract( + page, + '
', '')))[0] + + +class JpgchurchAlbumExtractor(JpgchurchImageExtractor, Extractor): + """Extractor for Jpgchurch Albums""" + subcategory = "album" + directory_fmt = ("{category}", "{user}", "{album}",) + pattern = BASE_PATTERN + r"/a(?:lbum)?/([\w\d\-\.]+)" + test = ("https://jpg.church/album/CDilP/?sort=date_desc&page=1",) + + def __init__(self, match): + Extractor.__init__(self, match) + self.album = match.group(1).split('.')[-1] + + def metadata(self): + return {"album": self.album} + + def images(self): + url = "{}/a/{}".format(self.root, self.album) + for _url in self._get_album_images(url): + yield self._get_images(_url) + + def _pagination(self, url): + """Uses recursion to yield the next page""" + yield url + page = self.request(url).text + _next = text.extract( + page, '<')[0] + if _next: + url = _next + yield from self._pagination(_next) + + def _get_album_images(self, url): + for _url in self._pagination(url): + page = self.request(_url).text + _page = text.extract_iter( + page, '
Date: Thu, 30 Jun 2022 20:16:10 +0530 Subject: [PATCH 005/252] remove unrelated changes --- gallery_dl/extractor/2chen.py | 93 -------------------------------- gallery_dl/extractor/__init__.py | 1 - 2 files changed, 94 deletions(-) delete mode 100644 gallery_dl/extractor/2chen.py diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py deleted file mode 100644 index f7cdb93a72..0000000000 --- a/gallery_dl/extractor/2chen.py +++ /dev/null @@ -1,93 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2017-2021 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -from .common import Extractor, Message -from .. import text - - -class _2chenThreadExtractor(Extractor): - """Extractor for 2chen threads""" - category = "2chen" - subcategory = "thread" - directory_fmt = ("{category}", "{board}", "{thread} {title}") - filename_fmt = "{filename}" - pattern = (r"(?:https?://)?2chen\.moe" - r"/([^/]+)/(\d+)") - test = ("https://2chen.moe/jp/303786",) - - def __init__(self, match): - Extractor.__init__(self, match) - self.board, self.thread = match.groups() - - def items(self): - url = "https://2chen.moe/{}/{}".format(self.board, self.thread) - page = self.request(url, encoding="utf-8").text - data = self.metadata(page) - yield Message.Directory, data - for post in self.posts(page): - if post["url"] is None or post["filename"] is None: - continue - url = "https://2chen.moe{}".format(post["url"]) - yield Message.Url, url, post - - def metadata(self, page): - title = text.extract(page, "
<h3>", "</h3>
")[0] - return { - "board": self.board, - "thread": self.thread, - "title": title - } - - def posts(self, page): - posts = text.extract_iter( - page, '
<figure>', '</figure>
') - return [self.parse(post) for post in posts] - - def parse(self, post): - data = self._extract_post(post) - data["extension"] = str(data["filename"]).split(".")[-1] - return data - - @staticmethod - def _extract_post(post): - return text.extract_all(post, ( - ('url', '
') diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d63c79344f..ebc2341acb 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -10,7 +10,6 @@ modules = [ "2chan", - "2chen", "35photo", "3dbooru", "420chan", From 7bbaf025c055954342b1ee95f2fa49cdbecb5437 Mon Sep 17 00:00:00 2001 From: enduser420 Date: Sun, 31 Jul 2022 20:28:40 +0530 Subject: [PATCH 006/252] [jpgchurch] refactor --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/jpgchurch.py | 190 +++++++++++++++++++----------- 2 files changed, 119 insertions(+), 72 deletions(-) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index ebc2341acb..ef52593d18 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -73,6 +73,7 @@ "kissgoddess", "kohlchan", "komikcast", + "lensdump", "lightroom", "lineblog", "livedoor", diff --git a/gallery_dl/extractor/jpgchurch.py b/gallery_dl/extractor/jpgchurch.py index e0105f26d5..dc848d3aa7 100644 --- a/gallery_dl/extractor/jpgchurch.py +++ b/gallery_dl/extractor/jpgchurch.py @@ -12,75 +12,37 @@ BASE_PATTERN = r"(?:https?://)?jpg\.church" -class JpgchurchImageExtractor(Extractor): - """Base Extractor for Jpgchurch Images""" - category = "Jpgchurch" - subcategory = "image" - directory_fmt = ("{category}", "{user}") - filename_fmt = "{filename}" - pattern = BASE_PATTERN + r"/img/([\w\d\-\.]+)" +class JpgchurchExtractor(Extractor): + """Base class for Jpgchurch extractors""" + category = "jpgchurch" root = "https://jpg.church" - test = ("https://jpg.church/img/funnymeme.LecXGS",) + directory_fmt = ("{category}", "{user}",) + archive_fmt = "{filename}" def __init__(self, match): Extractor.__init__(self, match) - self.image = match.group(1) def items(self): - data = self.metadata() for image in self.images(): - if "album" in image or "user" in image: - data.update(image) - yield Message.Directory, data + yield Message.Directory, image yield Message.Url, image["url"], image - def metadata(self): - """Return general metadata""" - return {} - def images(self): """Return an iterable containing the image(s)""" - url = "{}/img/{}".format(self.root, self.image) - return [self._get_images(url)] - - def _get_images(self, url): - page = self.request(url).text - data = self._extract_image(page) - data.update({ - "user": data["user"].split("/")[-1], - "extension": text.ext_from_url(data["url"]) - }) - return data @staticmethod - def _extract_image(page): - _page = text.extract( - page, - '
', '')))[0] - - -class JpgchurchAlbumExtractor(JpgchurchImageExtractor, Extractor): - """Extractor for Jpgchurch Albums""" - subcategory = "album" - directory_fmt = ("{category}", "{user}", "{album}",) - pattern = BASE_PATTERN + r"/a(?:lbum)?/([\w\d\-\.]+)" - test = ("https://jpg.church/album/CDilP/?sort=date_desc&page=1",) - - def __init__(self, match): - Extractor.__init__(self, match) - self.album = match.group(1).split('.')[-1] - - def metadata(self): - return {"album": self.album} + def _extract_user(page): + return text.extract(page, 'username: "', '"')[0] - def images(self): - url = "{}/a/{}".format(self.root, self.album) - for _url in self._get_album_images(url): - yield self._get_images(_url) + def _extract_image(self, url): + page = self.request(url).text + data = { + "url": text.extract( + page, '')[0], + } + text.nameext_from_url(data["url"], data) + data["user"] = self._extract_user(page) + return data def _pagination(self, url): """Uses recursion to yield the next page""" @@ -92,30 +54,114 @@ def _pagination(self, url): url = _next yield from self._pagination(_next) - def _get_album_images(self, url): - for _url in self._pagination(url): - page = self.request(_url).text - _page = text.extract_iter( + def _get_images(self, url): + for url in self._pagination(url): + page = self.request(url).text + album = text.extract(page, '')[0] + album = text.extract(album, '>', '')[0] + page = text.extract_iter( + page, '
')[0] + album = text.extract(album, '>', '')[0] + page = text.extract_iter( page, '
Date: Sun, 31 Jul 2022 20:36:19 +0530 Subject: [PATCH 007/252] [jpgchurch] . --- gallery_dl/extractor/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index ef52593d18..ebc2341acb 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -73,7 +73,6 @@ "kissgoddess", "kohlchan", "komikcast", - "lensdump", "lightroom", "lineblog", "livedoor", From 3d87cedc58366898fb865028db613fea04f4d9b3 Mon Sep 17 00:00:00 2001 From: enduser420 Date: Fri, 26 Aug 2022 15:58:19 +0530 Subject: [PATCH 008/252] [jpgchurch] rework the image extractor now the image extractor can recognize if an image if from an album also removed some unnecessary methods --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/jpgchurch.py | 92 +++++++++++++------------------ scripts/supportedsites.py | 1 + 3 files changed, 46 insertions(+), 53 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 30e74dede5..46d6d92ff2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -391,6 +391,12 @@ Consider all sites to be NSFW unless otherwise known. Galleries, individual Images + + Jpgchurch + https://jpg.church/ + Albums, individual Images, User Profiles + + Keenspot http://www.keenspot.com/ diff --git a/gallery_dl/extractor/jpgchurch.py b/gallery_dl/extractor/jpgchurch.py index dc848d3aa7..34910d108c 100644 --- a/gallery_dl/extractor/jpgchurch.py +++ b/gallery_dl/extractor/jpgchurch.py @@ -16,24 +16,19 @@ class JpgchurchExtractor(Extractor): """Base class for Jpgchurch extractors""" category = "jpgchurch" root = "https://jpg.church" - directory_fmt = ("{category}", "{user}",) - archive_fmt = "{filename}" - - def __init__(self, match): - Extractor.__init__(self, match) - - def items(self): - for image in self.images(): - yield Message.Directory, image - yield Message.Url, image["url"], image - - def images(self): - """Return an iterable containing the image(s)""" + directory_fmt = ("{category}", "{user}", "{album}",) + archive_fmt = "{user}_{filename}" @staticmethod def _extract_user(page): return text.extract(page, 'username: "', '"')[0] + @staticmethod + def _extract_album(page): + album = text.extract(page, 'Added to ', '')[0] + return album + def _extract_image(self, url): page = self.request(url).text data = { @@ -42,36 +37,22 @@ def _extract_image(self, url): } text.nameext_from_url(data["url"], data) data["user"] = self._extract_user(page) + data["album"] = self._extract_album(page) return data def _pagination(self, url): - """Uses recursion to yield the next page""" - yield url - page = self.request(url).text - _next = text.extract( - page, '<')[0] - if _next: - url = _next - yield from self._pagination(_next) - - def _get_images(self, url): - for url in self._pagination(url): + while True: + yield url page = self.request(url).text - album = text.extract(page, '')[0] - album = text.extract(album, '>', '')[0] - page = text.extract_iter( - page, '
<')[0] + if not _next: + return + url = _next def _get_albums(self, url): for url in self._pagination(url): page = self.request(url).text - album = text.extract(page, '')[0] - album = text.extract(album, '>', '')[0] page = text.extract_iter( page, '
Date: Fri, 26 Aug 2022 21:58:32 +0530 Subject: [PATCH 009/252] [jpgchurch] fix format in supportedsites.py --- scripts/supportedsites.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 9009640e0f..1743efb4ca 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -58,7 +58,7 @@ "imgth" : "imgth", "imgur" : "imgur", "joyreactor" : "JoyReactor", - "jpgchurch" : "Jpgchurch", + "jpgchurch" : "Jpgchurch", "kabeuchi" : "かべうち", "kemonoparty" : "Kemono", "kireicake" : "Kirei Cake", From 039d06c8f61f7ed0a81aabda120c29168d9003ff Mon Sep 17 00:00:00 2001 From: enduser420 Date: Sun, 13 Nov 2022 16:00:29 +0530 Subject: [PATCH 010/252] [mangaread] add 'chapter' and 'manga' extractors --- docs/supportedsites.md | 6 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/mangaread.py | 189 ++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + 4 files changed, 197 insertions(+) create mode 100644 gallery_dl/extractor/mangaread.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 7d10d9690a..c8ebbec9e5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -511,6 +511,12 @@ Consider all sites to be NSFW unless otherwise known. Chapters, Manga + + MangaRead + https://mangaread.org/ + Chapters, Manga + + MangaSee https://mangasee123.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 6998a21ea2..b62671a3f9 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -86,6 +86,7 @@ "mangakakalot", "manganelo", "mangapark", + "mangaread", "mangasee", "mangoxo", "mememuseum", diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py new file mode 100644 index 0000000000..b8cf1e88ef --- /dev/null +++ b/gallery_dl/extractor/mangaread.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://mangaread.org/""" + +from .common import ChapterExtractor, MangaExtractor +from .. 
import text, exception +import re + + +class MangareadBase(): + """Base class for Mangaread extractors""" + category = "mangaread" + root = "https://www.mangaread.org" + + @staticmethod + def parse_chapter_string(chapter_string, data): + match = re.match( + r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?", + text.unescape(chapter_string).strip()) + manga, chapter, minor, title = match.groups() + manga = manga.strip() if manga else "" + data["manga"] = data.pop("manga", manga) + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = minor or "" + data["title"] = title or "" + data["lang"] = "en" + data["language"] = "English" + + +class MangareadChapterExtractor(MangareadBase, ChapterExtractor): + """Extractor for manga-chapters from mangaread.org""" + pattern = (r"(?:https?://)?(?:www\.)?mangaread\.org" + r"(/manga/[^/?#]+/[^/?#]+)") + test = ( + ("https://www.mangaread.org/manga/one-piece/chapter-1053-3/", { + "pattern": (r"https://www\.mangaread\.org/wp-content/uploads" + r"/WP-manga/data/manga_[^/]+/[^/]+/[^.]+\.\w+"), + "count": 11, + "keyword": { + "manga" : "One Piece", + "title" : "", + "chapter" : 1053, + "chapter_minor": ".3", + "tags" : ["Oda Eiichiro"], + "lang" : "en", + "language": "English", + } + }), + ("https://www.mangaread.org/manga/one-piece/chapter-1000000/", { + "exception": exception.NotFoundError, + }), + (("https://www.mangaread.org" + "/manga/kanan-sama-wa-akumade-choroi/chapter-10/"), { + "pattern": (r"https://www\.mangaread\.org/wp-content/uploads" + r"/WP-manga/data/manga_[^/]+/[^/]+/[^.]+\.\w+"), + "count": 9, + "keyword": { + "manga" : "Kanan-sama wa Akumade Choroi", + "title" : "", + "chapter" : 10, + "chapter_minor": "", + "tags" : list, + "lang" : "en", + "language": "English", + } + }), + # 'Chapter146.5' + # ^^ no whitespace + ("https://www.mangaread.org/manga/above-all-gods/chapter146-5/", { + "pattern": (r"https://www\.mangaread\.org/wp-content/uploads" + r"/WP-manga/data/manga_[^/]+/[^/]+/[^.]+\.\w+"), + "count": 6, + "keyword": { + "manga" : "Above All Gods", + "title" : "", + "chapter" : 146, + "chapter_minor": ".5", + "tags" : list, + "lang" : "en", + "language": "English", + } + }), + ) + + def metadata(self, page): + data = {"tags": list(text.extract_iter(page, 'class="">', "<"))} + info = text.extr(page, '
<h1 id="chapter-heading">', "</h1>
") + if not info: + raise exception.NotFoundError("chapter") + self.parse_chapter_string(info, data) + return data + + def images(self, page): + page = text.extr( + page, '
<div class="reading-content">', '<div class="entry-header') + return [ + (url.strip(), None) + for url in text.extract_iter(page, 'data-src="', '"') + ] + + +class MangareadMangaExtractor(MangareadBase, MangaExtractor): + """Extractor for manga from mangaread.org""" + chapterclass = MangareadChapterExtractor + pattern = r"(?:https?://)?(?:www\.)?mangaread\.org(/manga/[^/?#]+)" + test = ( + ("https://www.mangaread.org/manga/kanan-sama-wa-akumade-choroi", { + "pattern": (r"https://www\.mangaread\.org/manga" + r"/kanan-sama-wa-akumade-choroi/chapter-\d+(-.+)?/"), + "count" : ">
= 13", + "keyword": { + "manga" : "Kanan-sama wa Akumade Choroi", + "author" : ["nonco"], + "artist" : ["nonco"], + "type" : "Manga", + "genres" : ["Comedy", "Romance", "Shounen", "Supernatural"], + "rating" : float, + "release": 2022, + "status" : "OnGoing", + "lang" : "en", + "language" : "English", + "manga_alt" : list, + "description": str, + } + }), + ("https://www.mangaread.org/manga/one-piece", { + "pattern": (r"https://www\.mangaread\.org/manga" + r"/one-piece/chapter-\d+(-.+)?/"), + "count" : ">= 1066", + "keyword": { + "manga" : "One Piece", + "author" : ["Oda Eiichiro"], + "artist" : ["Oda Eiichiro"], + "type" : "Manga", + "genres" : list, + "rating" : float, + "release": 1997, + "status" : "OnGoing", + "lang" : "en", + "language" : "English", + "manga_alt" : ["One Piece"], + "description": str, + } + }), + ("https://www.mangaread.org/manga/doesnotexist", { + "exception": exception.HttpError, + }), + ) + + def chapters(self, page): + data = self.metadata(page) + result = [] + for chapter in text.extract_iter( + page, '
<li class="wp-manga-chapter', "</li>"): + url , pos = text.extract(chapter, '<a href="', '"') + info, _ = text.extract(chapter, ">", "</a>", pos) + self.parse_chapter_string(info, data) + result.append((url, data.copy())) + return result + + def metadata(self, page): + extr = text.extract_from(text.extr( + page, 'class="summary_content">', 'class="manga-action"')) + return { + "manga" : text.extr(page, "<h1>", "</h1>").strip(), + "description": text.unescape(text.remove_html(text.extract( + page, ">", "</div>", page.index("summary__content"))[0])), + "rating" : text.parse_float( + extr('total_votes">', "</span>").strip()), + "manga_alt" : text.remove_html( + extr("Alternative \n</h5>", "</div>")).split("; "), + "author" : list(text.extract_iter( + extr('class="author-content">', "</div>"), '"tag">', "</a>")), + "artist" : list(text.extract_iter( + extr('class="artist-content">', "</div>"), '"tag">', "</a>")), + "genres" : list(text.extract_iter( + extr('class="genres-content">', "</div>"), '"tag">', "</a>")), + "type" : text.remove_html( + extr("Type \n</h5>", "</div>")), + "release" : text.parse_int(text.remove_html( + extr("Release \n</h5>", "</div>"))), + "status" : text.remove_html( + extr("Status \n</h5>", "</div>")), + } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 65a759d943..770bf3587e 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -72,6 +72,7 @@ "mangalife" : "MangaLife", "manganelo" : "Manganato", "mangapark" : "MangaPark", + "mangaread" : "MangaRead", "mangasee" : "MangaSee", "mastodon.social": "mastodon.social", "mememuseum" : "meme.museum", From ade9789b3eb5fda667c792fea8e9808c291a7a2 Mon Sep 17 00:00:00 2001 From: enduser420 Date: Sun, 13 Nov 2022 16:04:17 +0530 Subject: [PATCH 011/252] [mangaread] update regex --- gallery_dl/extractor/mangaread.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py index b8cf1e88ef..4af90e09f1 100644 --- a/gallery_dl/extractor/mangaread.py +++ b/gallery_dl/extractor/mangaread.py @@ -106,7 +106,7 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor): """Extractor for manga from mangaread.org""" chapterclass = MangareadChapterExtractor - pattern = r"(?:https?://)?(?:www\.)?mangaread\.org(/manga/[^/?#]+)" + pattern = r"(?:https?://)?(?:www\.)?mangaread\.org(/manga/[^/?#]+)/?$" test = ( ("https://www.mangaread.org/manga/kanan-sama-wa-akumade-choroi", { "pattern": (r"https://www\.mangaread\.org/manga" From 8cbc05786af370c49fdd37ffbce79d0d69837c80 Mon Sep 17 00:00:00 2001 From: 0x1f595 <0x1f595@users.noreply.github.com> Date: Tue, 29 Nov 2022 20:22:00 -0800 Subject: [PATCH 012/252] Add 8muses album permalink parts to album data This allows customizing the directory without breaking changes. --- gallery_dl/extractor/8muses.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py index fed4991334..4880493ab2 100644 --- a/gallery_dl/extractor/8muses.py +++ b/gallery_dl/extractor/8muses.py @@ -119,6 +119,7 @@ def _make_album(self, album): return { "id" : album["id"], "path" : album["path"], + "parts" : album["permalink"].split('/'), "title" : album["name"], "private": album["isPrivate"], "url" : self.root + album["permalink"], "parent" : text.parse_int(album["parentId"]), "views" : text.parse_int(album["numberViews"]), "likes" : text.parse_int(album["numberLikes"]), From 19ea6ee84fd1e228b1056db726cf67fb79453441 Mon Sep 17 00:00:00 2001 From: 0x1f595 <0x1f595@users.noreply.github.com> Date: Tue, 29 Nov 2022 20:27:01 -0800 Subject: [PATCH 013/252] Fix 8muses album URL, add permalink path --- gallery_dl/extractor/8muses.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py index 4880493ab2..56c880e34e 100644 --- a/gallery_dl/extractor/8muses.py +++ b/gallery_dl/extractor/8muses.py @@ -119,10 +119,11 @@ def _make_album(self, album): return { "id" : album["id"], "path" : album["path"], - "parts" : album["permalink"].split('/'), + "parts" : album["permalink"], "title" : album["name"], "private": album["isPrivate"], - "url" : self.root + album["permalink"], + "permalink" : album["permalink"], + "url" : self.root + "/" + album["permalink"], "parent" : text.parse_int(album["parentId"]), "views" : text.parse_int(album["numberViews"]), "likes" : text.parse_int(album["numberLikes"]), From 895b41f1acfc453be3dd0765f6a607158ebd4e74 Mon Sep 17 00:00:00 2001 From: lx30011 <64314625+lx30011@users.noreply.github.com> Date: Thu, 22 Dec 2022 23:50:56 +0100 Subject: [PATCH 014/252] [jschan] add generic jschan extractor --- docs/supportedsites.md | 10 ++++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/jschan.py | 96 
++++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + 4 files changed, 108 insertions(+) create mode 100644 gallery_dl/extractor/jschan.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e6a1c845fd..0ce12f8a46 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1052,6 +1052,16 @@ Consider all sites to be NSFW unless otherwise known. + + jschan Imageboards + + + 94chan + https://94chan.org/ + Boards, Threads + + + LynxChan Imageboards diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 444075c151..b913e3c125 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -71,6 +71,7 @@ "instagram", "issuu", "itaku", + "jschan", "kabeuchi", "keenspot", "kemonoparty", diff --git a/gallery_dl/extractor/jschan.py b/gallery_dl/extractor/jschan.py new file mode 100644 index 0000000000..cc2c7deeb0 --- /dev/null +++ b/gallery_dl/extractor/jschan.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for jschan Imageboards""" + +from .common import BaseExtractor, Message +from .. import text +import itertools + + +class JschanExtractor(BaseExtractor): + basecategory = "jschan" + + +BASE_PATTERN = JschanExtractor.update({ + "94chan": { + "root": "https://94chan.org", + "pattern": r"94chan\.org" + } +}) + + +class JschanThreadExtractor(JschanExtractor): + """Extractor for jschan threads""" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", + "{threadId} {subject[:50]|message[:50]}") + filename_fmt = "{postId}{num:?-//} {filename}.{extension}" + archive_fmt = "{board}_{postId}_{num}" + pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)\.html" + test = ( + ("https://94chan.org/art/thread/25.html", { + "pattern": r"https://94chan.org/file/[0-9a-f]{64}(\.\w+)?", + "count": ">= 15" + }) + ) + + def __init__(self, match): + JschanExtractor.__init__(self, match) + index = match.lastindex + self.board = match.group(index-1) + self.thread = match.group(index) + + def items(self): + url = "{}/{}/thread/{}.json".format( + self.root, self.board, self.thread) + thread = self.request(url).json() + thread["threadId"] = thread["postId"] + posts = thread.pop("replies", ()) + + yield Message.Directory, thread + for post in itertools.chain((thread,), posts): + files = post.pop("files", ()) + if files: + thread.update(post) + for num, file in enumerate(files): + file.update(thread) + url = self.root + "/file/" + file["filename"] + file["num"] = num + file["count"] = len(files) + file["siteFilename"] = file["filename"] + text.nameext_from_url(file["originalFilename"], file) + yield Message.Url, url, file + + +class JschanBoardExtractor(JschanExtractor): + """Extractor for jschan boards""" + subcategory = "board" + pattern = ( + BASE_PATTERN + r"/([^/?#]+)(?:/index\.html|" + r"/catalog\.html|/\d+\.html|/?$)" + ) + test = ( + ("https://94chan.org/art/", { + "pattern": JschanThreadExtractor.pattern, + "count": ">= 30" + }), + ("https://94chan.org/art/2.html"), + ("https://94chan.org/art/catalog.html"), + ("https://94chan.org/art/index.html"), + ) + + def __init__(self, match): + JschanExtractor.__init__(self, match) + self.board = match.group(match.lastindex) + + def items(self): + url = "{}/{}/catalog.json".format(self.root, self.board) + for thread in self.request(url).json(): + url = 
"{}/{}/thread/{}.html".format( + self.root, self.board, thread["postId"]) + thread["_extractor"] = JschanThreadExtractor + yield Message.Queue, url, thread diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 1df98ca40c..e4c68eb6cd 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -254,6 +254,7 @@ "foolslide" : "FoOlSlide Instances", "gelbooru_v01": "Gelbooru Beta 0.1.11", "gelbooru_v02": "Gelbooru Beta 0.2", + "jschan" : "jschan Imageboards", "lolisafe" : "lolisafe and chibisafe", "lynxchan" : "LynxChan Imageboards", "moebooru" : "Moebooru and MyImouto", From df77271438238afac534fe48e23938a7f45289b6 Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Thu, 9 Mar 2023 20:55:28 +0800 Subject: [PATCH 015/252] [downloader:http] add 'consume-content' option * fix connection not being released when the response is neither successful nor retried * add the ability to consume the HTTP response body instead of closing the connection reference: https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow --- docs/configuration.rst | 19 +++++++++++++++++++ gallery_dl/downloader/http.py | 17 ++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index fbb0416b4a..c88f8eb1ff 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3616,6 +3616,25 @@ Description contains JPEG/JFIF data. +downloader.http.consume-content +--------------------------------- +Type + ``bool`` +Default + ``false`` +Description + Controls the behavior when an HTTP response is considered + unsuccessful + + If the value is ``true``, consume the response body. This + avoids closing the connection and therefore improves connection + reuse. + + If the value is ``false``, immediately close the connection + without reading the response. This can be useful if the server + is known to send large bodies for error responses. 
+ + downloader.http.chunk-size -------------------------- Type diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index f14af24967..30b5971487 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -44,6 +44,12 @@ def __init__(self, job): self.mtime = self.config("mtime", True) self.rate = self.config("rate") + if not self.config("consume-content", False): + # this resets the underlying TCP connection, and therefore + # if the program makes another request to the same domain, + # a new connection (either TLS or plain TCP) must be made + self.release_conn = lambda resp: resp.close() + if self.retries < 0: self.retries = float("inf") if self.minsize: @@ -113,7 +119,7 @@ def _download_impl(self, url, pathfmt): while True: if tries: if response: - response.close() + self.release_conn(response) response = None self.log.warning("%s (%s/%s)", msg, tries, self.retries+1) if tries > self.retries: @@ -170,6 +176,7 @@ def _download_impl(self, url, pathfmt): if code in retry_codes or 500 <= code < 600: continue self.log.warning(msg) + self.release_conn(response) return False # check for invalid responses @@ -182,6 +189,7 @@ def _download_impl(self, url, pathfmt): continue if not result: self.log.warning("Invalid response") + self.release_conn(response) return False # check file size @@ -191,11 +199,13 @@ def _download_impl(self, url, pathfmt): self.log.warning( "File size smaller than allowed minimum (%s < %s)", size, self.minsize) + self.release_conn(response) return False if self.maxsize and size > self.maxsize: self.log.warning( "File size larger than allowed maximum (%s > %s)", size, self.maxsize) + self.release_conn(response) return False build_path = False @@ -284,6 +294,11 @@ def _download_impl(self, url, pathfmt): return True + def release_conn(self, response): + """Release connection back to pool by consuming response body""" + for _ in response.iter_content(self.chunk_size): + pass + @staticmethod def receive(fp, content, bytes_total, bytes_start): write = fp.write From fcaeaf539cb913fa7c0e076e34e48f1e37ccf545 Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Sat, 11 Mar 2023 21:36:37 +0800 Subject: [PATCH 016/252] [downloader:http] handle exceptions while consuming content --- docs/configuration.rst | 2 +- gallery_dl/downloader/http.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index c88f8eb1ff..a64322d823 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3617,7 +3617,7 @@ Description downloader.http.consume-content ---------------------------------- +------------------------------- Type ``bool`` Default diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 30b5971487..59cd0ac08c 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -296,8 +296,15 @@ def _download_impl(self, url, pathfmt): def release_conn(self, response): """Release connection back to pool by consuming response body""" - for _ in response.iter_content(self.chunk_size): - pass + try: + for _ in response.iter_content(self.chunk_size): + pass + except (RequestException, SSLError, OpenSSLError) as exc: + print() + self.log.debug( + "Unable to consume response body (%s); " + "closing the connection anyway", exc) + response.close() @staticmethod def receive(fp, content, bytes_total, bytes_start): From d4fb4ff47ff065c2224cb805859eb90e96de799a Mon Sep 17 00:00:00 2001 From: 
ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Sat, 18 Mar 2023 20:15:56 +0800 Subject: [PATCH 017/252] [twitter] extract TwitPic URLs in text (#3792) also ignore previously seen URLs --- gallery_dl/extractor/twitter.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 29b4ac3531..12e9ed9e7a 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -13,6 +13,7 @@ from ..cache import cache import itertools import json +import re BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:[fv]x)?twitter\.com" @@ -74,6 +75,10 @@ def items(self): else: seen_tweets = None + if self.twitpic: + self._find_twitpic = re.compile( + r"https?://(twitpic\.com/\w+)").finditer + for tweet in self.tweets(): if "legacy" in tweet: @@ -226,12 +231,27 @@ def _extract_card(self, tweet, files): files.append({"url": url}) def _extract_twitpic(self, tweet, files): - for url in tweet["entities"].get("urls", ()): + # collect urls + urls = [] + for url in tweet["entities"].get("urls") or (): url = url["expanded_url"] if "//twitpic.com/" not in url or "/photos/" in url: continue if url.startswith("http:"): url = "https" + url[4:] + urls.append(url) + tget = tweet.get + for match in self._find_twitpic( + tget("full_text") or tget("text") or ""): + urls.append(text.ensure_http_scheme(match.group(1))) + + # extract actual urls + seen = set() + for url in urls: + if url in seen: + self.log.debug("Skipping %s (previously seen)", url) + continue + seen.add(url) response = self.request(url, fatal=False) if response.status_code >= 400: continue @@ -724,7 +744,13 @@ class TwitterTweetExtractor(TwitterExtractor): ("https://twitter.com/i/web/status/112900228289540096", { "options": (("twitpic", True), ("cards", False)), "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg", - "count": 3, + "count": 2, # 1 duplicate + }), + # TwitPic URL not in 'urls' (#3792) + ("https://twitter.com/shimoigusaP/status/8138669971", { + "options": (("twitpic", True),), + "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.png", + "count": 1, }), # Twitter card (#1005) ("https://twitter.com/billboard/status/1306599586602135555", { From 1a977f0f62373cc53ef248f9ba901bbe43a01eb6 Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Thu, 23 Mar 2023 19:57:13 +0800 Subject: [PATCH 018/252] [downloader:http] handle exceptions in 'validate' This isn't strictly necessary for 'exhentai.py', but it improves efficiency when the adapter is reused --- gallery_dl/downloader/http.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 59cd0ac08c..b3f381ecdd 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -182,7 +182,11 @@ def _download_impl(self, url, pathfmt): # check for invalid responses validate = kwdict.get("_http_validate") if validate and self.validate: - result = validate(response) + try: + result = validate(response) + except Exception: + self.release_conn(response) + raise if isinstance(result, str): url = result tries -= 1 From 775d2ac9995d3efcffff6f696789677ee0f70e4e Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Fri, 31 Mar 2023 20:08:38 +0800 Subject: [PATCH 019/252] [downloader:http] improve error logging when releasing connection --- gallery_dl/downloader/http.py 
| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index b3f381ecdd..d8708fbaf5 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -306,8 +306,8 @@ def release_conn(self, response): except (RequestException, SSLError, OpenSSLError) as exc: print() self.log.debug( - "Unable to consume response body (%s); " - "closing the connection anyway", exc) + "Unable to consume response body (%s: %s); " + "closing the connection anyway", exc.__class__.__name__, exc) response.close() @staticmethod From cd4bfb0dd1e92db18209856f2f2df99718f9823a Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Thu, 20 Apr 2023 15:46:32 +0800 Subject: [PATCH 020/252] [reddit] match 'preview.redd.it' URLs --- gallery_dl/extractor/reddit.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 305de2a00d..d02c9d6865 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -303,8 +303,8 @@ class RedditImageExtractor(Extractor): category = "reddit" subcategory = "image" archive_fmt = "{filename}" - pattern = (r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)" - r"/[^/?#]+(?:\?[^#]*)?") + pattern = (r"(?:https?://)?((?:i|preview)\.redd\.it|i\.reddituploads\.com)" + r"/([^/?#]+)(\?[^#]*)?") test = ( ("https://i.redd.it/upjtjcx2npzz.jpg", { "url": "0de614900feef103e580b632190458c0b62b641a", @@ -315,12 +315,29 @@ class RedditImageExtractor(Extractor): "url": "f24f25efcedaddeec802e46c60d77ef975dc52a5", "content": "541dbcc3ad77aa01ee21ca49843c5e382371fae7", }), + # preview.redd.it -> i.redd.it + (("https://preview.redd.it/00af44lpn0u51.jpg?width=960&crop=smart" + "&auto=webp&v=enabled&s=dbca8ab84033f4a433772d9c15dbe0429c74e8ac"), { + "pattern": r"^https://i\.redd\.it/00af44lpn0u51\.jpg$" + }), ) + def __init__(self, match): + Extractor.__init__(self, match) + domain = match.group(1) + self.path = match.group(2) + if domain == "preview.redd.it": + self.domain = "i.redd.it" + self.query = "" + else: + self.domain = domain + self.query = match.group(3) or "" + def items(self): - data = text.nameext_from_url(self.url) + url = "https://{}/{}{}".format(self.domain, self.path, self.query) + data = text.nameext_from_url(url) yield Message.Directory, data - yield Message.Url, self.url, data + yield Message.Url, url, data class RedditAPI(): From b81ce381f0ebf6cd8ca23b3f7959d67d2646b250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 21 Apr 2023 23:12:01 +0200 Subject: [PATCH 021/252] fix backticks for 'extractor.redgifs.format' docs https://github.com/mikf/gallery-dl/issues/146#issuecomment-1518331927 --- docs/configuration.rst | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index f5652b77f1..aa6b14e04f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1651,7 +1651,11 @@ Default ``["mp4", "webm", "mobile", "gif"]`` Description List of names of the preferred animation format, which can be - ``"mp4"``, ``"webm"``, ``"mobile"``, ``"gif"``, or ``"webp"``. + ``"mp4"``, + ``"webm"``, + ``"mobile"``, + ``"gif"``, or + ``"webp"``. If a selected format is not available, the next one in the list will be tried until an available format is found. 
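(Editor's note: as a usage sketch, such a preference list maps onto the JSON config like so — shown here for extractor.redgifs.format, the option named in this commit's subject and documented in the hunk that follows; the values are the defaults quoted there, and the config layout is gallery-dl's standard one.)

    {
        "extractor": {
            "redgifs": {
                "format": ["hd", "sd", "gif"]
            }
        }
    }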
@@ -2602,7 +2606,12 @@ Default ``["hd", "sd", "gif"]`` Description List of names of the preferred animation format, which can be - ``"hd"``, ``"sd"``, `"gif"``, `"vthumbnail"``, `"thumbnail"``, or ``"poster"``. + ``"hd"``, + ``"sd"``, + ``"gif"``, + ``"thumbnail"``, + ``"vthumbnail"``, or + ``"poster"``. If a selected format is not available, the next one in the list will be tried until an available format is found. From a05120412af1c958714631fc85a69568b28cd5e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 23 Apr 2023 15:00:09 +0200 Subject: [PATCH 022/252] [oauth] catch exception from 'webbrowser.get()' (#3947) It raises an exception instead of returning None when no runnable browser is available. --- gallery_dl/extractor/oauth.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index ec46ca3bff..404f296d24 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -71,8 +71,11 @@ def open(self, url, params, recv=None): browser = self.config("browser", True) if browser: - import webbrowser - browser = webbrowser.get() + try: + import webbrowser + browser = webbrowser.get() + except Exception: + browser = None if browser and browser.open(url): name = getattr(browser, "name", "Browser") From 7f25cab56e148c4499c7b5173134a15e22105311 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 23 Apr 2023 16:46:40 +0200 Subject: [PATCH 023/252] [sankaku] support post URLs with MD5 hashes (#3952) --- gallery_dl/extractor/sankaku.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 789c63d655..f36051bb46 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -168,7 +168,7 @@ class SankakuPostExtractor(SankakuExtractor): """Extractor for single posts from sankaku.app""" subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post/show/(\d+)" + pattern = BASE_PATTERN + r"/post/show/([0-9a-f]+)" test = ( ("https://sankaku.app/post/show/360451", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", @@ -196,6 +196,17 @@ class SankakuPostExtractor(SankakuExtractor): "tags_general": ["key(mangaka)", "key(mangaka)"], }, }), + # md5 hexdigest instead of ID (#3952) + (("https://chan.sankakucomplex.com/post/show" + "/f8ba89043078f0e4be2d9c46550b840a"), { + "pattern": r"https://s\.sankakucomplex\.com" + r"/data/f8/ba/f8ba89043078f0e4be2d9c46550b840a\.jpg", + "count": 1, + "keyword": { + "id": 33195194, + "md5": "f8ba89043078f0e4be2d9c46550b840a", + }, + }), ("https://chan.sankakucomplex.com/post/show/360451"), ("https://chan.sankakucomplex.com/ja/post/show/360451"), ("https://beta.sankakucomplex.com/post/show/360451"), @@ -263,7 +274,7 @@ def posts(self, post_id): "lang" : "en", "page" : "1", "limit": "1", - "tags" : "id_range:" + post_id, + "tags" : ("md5:" if len(post_id) == 32 else "id_range:") + post_id, } return self._call("/posts", params) From 5d7435e8032a8483d4f9509e8591ff36eb914f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 23 Apr 2023 19:13:27 +0200 Subject: [PATCH 024/252] [nitter] extract user IDs from encoded banner URLs still requires a banner to be present to begin with --- gallery_dl/extractor/nitter.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index 
5f4ceeafc0..beb3da2557 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -162,7 +162,11 @@ def _user_from_html(self, html): banner = extr('class="profile-banner"> Date: Sun, 23 Apr 2023 21:10:16 +0200 Subject: [PATCH 025/252] [deviantart] revert e9353c63; retry downloads with private token --- gallery_dl/extractor/deviantart.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index f532a9766d..18d9867b46 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1417,7 +1417,14 @@ def deviation_download(self, deviation_id, public=None): """Get the original file download (if allowed)""" endpoint = "/deviation/download/" + deviation_id params = {"mature_content": self.mature} - return self._call(endpoint, params=params, public=public) + + try: + return self._call( + endpoint, params=params, public=public, log=False) + except Exception: + if not self.refresh_token_key: + raise + return self._call(endpoint, params=params, public=False) def deviation_metadata(self, deviations): """ Fetch deviation metadata for a set of deviations""" @@ -1518,7 +1525,7 @@ def _authenticate_impl(self, refresh_token_key): refresh_token_key, data["refresh_token"]) return "Bearer " + data["access_token"] - def _call(self, endpoint, fatal=True, public=None, **kwargs): + def _call(self, endpoint, fatal=True, log=True, public=None, **kwargs): """Call an API endpoint""" url = "https://www.deviantart.com/api/v1/oauth2" + endpoint kwargs["fatal"] = None @@ -1563,7 +1570,8 @@ def _call(self, endpoint, fatal=True, public=None, **kwargs): "cs/configuration.rst#extractordeviantartclient-id" "--client-secret") else: - self.log.error(msg) + if log: + self.log.error(msg) return data def _pagination(self, endpoint, params, @@ -1571,15 +1579,14 @@ def _pagination(self, endpoint, params, warn = True if public is None: public = self.public - elif not public: - self.public = False while True: data = self._call(endpoint, params=params, public=public) - if key not in data: + try: + results = data[key] + except KeyError: self.log.error("Unexpected API response: %s", data) return - results = data[key] if unpack: results = [item["journal"] for item in results @@ -1588,7 +1595,7 @@ def _pagination(self, endpoint, params, if public and len(results) < params["limit"]: if self.refresh_token_key: self.log.debug("Switching to private access token") - self.public = public = False + public = False continue elif data["has_more"] and warn: warn = False From 6f4a843fbae3026f293d804f730dc1e778d66a8c Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Mon, 24 Apr 2023 23:59:36 +0800 Subject: [PATCH 026/252] [downloader:http] release connection before logging messages This allows connections to be properly released when using 'actions' feature. 
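(Editor's note: the connection-reuse behavior that this commit and the earlier 'consume-content' patches adjust, reduced to a plain requests sketch. This is a standalone illustration, not gallery-dl code; the URL and chunk size are placeholders.)

    import requests

    session = requests.Session()
    response = session.get("https://example.org/file", stream=True)

    if response.status_code >= 400:
        # 'consume-content: true' behavior: drain the body so urllib3
        # can return the connection to the pool for later reuse
        for _ in response.iter_content(chunk_size=16384):
            pass
        # 'consume-content: false' would call response.close() instead,
        # dropping the connection and forcing a new handshake next time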
--- gallery_dl/downloader/http.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index d8708fbaf5..434689f15a 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -175,8 +175,8 @@ def _download_impl(self, url, pathfmt): msg = "'{} {}' for '{}'".format(code, response.reason, url) if code in retry_codes or 500 <= code < 600: continue - self.log.warning(msg) self.release_conn(response) + self.log.warning(msg) return False # check for invalid responses @@ -192,24 +192,24 @@ def _download_impl(self, url, pathfmt): tries -= 1 continue if not result: - self.log.warning("Invalid response") self.release_conn(response) + self.log.warning("Invalid response") return False # check file size size = text.parse_int(size, None) if size is not None: if self.minsize and size < self.minsize: + self.release_conn(response) self.log.warning( "File size smaller than allowed minimum (%s < %s)", size, self.minsize) - self.release_conn(response) return False if self.maxsize and size > self.maxsize: + self.release_conn(response) self.log.warning( "File size larger than allowed maximum (%s > %s)", size, self.maxsize) - self.release_conn(response) return False build_path = False From de670bd7de8600a1481ee6366680d3e08659f0f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 24 Apr 2023 20:07:10 +0200 Subject: [PATCH 027/252] [tumblr] update pagination logic (#2191) --- gallery_dl/extractor/tumblr.py | 110 +++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 41 deletions(-) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 155db1e5b0..1473e14834 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -269,7 +269,7 @@ def _original_image_fallback(self, url, post_id): class TumblrUserExtractor(TumblrExtractor): - """Extractor for all images from a tumblr-user""" + """Extractor for a Tumblr user's posts""" subcategory = "user" pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$" test = ( @@ -307,6 +307,16 @@ class TumblrUserExtractor(TumblrExtractor): "options": (("date-min", "201804"), ("date-max", "201805"), ("date-format", "%Y%m")) }), + # pagination with 'date-max' (#2191) and 'api-key' + ("https://donttrustthetits.tumblr.com/", { + "options": ( + ("access-token", None), + ("original", False), + ("date-max", "2015-04-25T00:00:00"), + ("date-min", "2015-04-01T00:00:00"), + ), + "count": 316, + }), ("https://demo.tumblr.com/page/2"), ("https://demo.tumblr.com/archive"), ("tumblr:http://www.b-authentique.com/"), @@ -321,7 +331,7 @@ def posts(self): class TumblrPostExtractor(TumblrExtractor): - """Extractor for images from a single post on tumblr""" + """Extractor for a single Tumblr post""" subcategory = "post" pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)" test = ( @@ -389,7 +399,7 @@ def _setup_posttypes(): class TumblrTagExtractor(TumblrExtractor): - """Extractor for images from a tumblr-user by tag""" + """Extractor for Tumblr user's posts by tag""" subcategory = "tag" pattern = BASE_PATTERN + r"/tagged/([^/?#]+)" test = ( @@ -412,7 +422,7 @@ def posts(self): class TumblrLikesExtractor(TumblrExtractor): - """Extractor for images from a 
tumblr-user's liked posts""" + """Extractor for a Tumblr user's liked posts""" subcategory = "likes" directory_fmt = ("{category}", "{blog_name}", "likes") archive_fmt = "f_{blog[name]}_{id}_{num}" @@ -431,7 +441,11 @@ def posts(self): class TumblrAPI(oauth.OAuth1API): - """Minimal interface for the Tumblr API v2""" + """Interface for the Tumblr API v2 + + https://github.com/tumblr/docs/blob/master/api.md + """ + ROOT = "https://api.tumblr.com" API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B" API_SECRET = "sFdsK3PDdP2QpYMRAoq0oDnw0sFS24XigXmdfnaeNZpJpqAn03" BLOG_CACHE = {} @@ -442,55 +456,46 @@ def __init__(self, extractor): def info(self, blog): """Return general information about a blog""" - if blog not in self.BLOG_CACHE: - self.BLOG_CACHE[blog] = self._call(blog, "info", {})["blog"] - return self.BLOG_CACHE[blog] + try: + return self.BLOG_CACHE[blog] + except KeyError: + endpoint = "/v2/blog/{}/info".format(blog) + params = {"api_key": self.api_key} if self.api_key else None + self.BLOG_CACHE[blog] = blog = self._call(endpoint, params)["blog"] + return blog def avatar(self, blog, size="512"): """Retrieve a blog avatar""" if self.api_key: - url_fmt = "https://api.tumblr.com/v2/blog/{}/avatar/{}?api_key={}" - return url_fmt.format(blog, size, self.api_key) + return "{}/v2/blog/{}/avatar/{}?api_key={}".format( + self.ROOT, blog, size, self.api_key) + endpoint = "/v2/blog/{}/avatar".format(blog) params = {"size": size} - data = self._call(blog, "avatar", params, allow_redirects=False) - return data["avatar_url"] + return self._call( + endpoint, params, allow_redirects=False)["avatar_url"] def posts(self, blog, params): """Retrieve published posts""" - params["offset"] = self.extractor.config("offset") or 0 - params["limit"] = 50 + params["offset"] = self.extractor.config("offset") + params["limit"] = "50" params["reblog_info"] = "true" + params["type"] = self.posts_type + params["before"] = self.before - if self.posts_type: - params["type"] = self.posts_type - if self.before: - params["before"] = self.before + if self.before and params["offset"]: + self.log.warning("'offset' and 'date-max' cannot be used together") - while True: - data = self._call(blog, "posts", params) - self.BLOG_CACHE[blog] = data["blog"] - yield from data["posts"] - params["offset"] += params["limit"] - if params["offset"] >= data["total_posts"]: - return + return self._pagination(blog, "/posts", params, cache=True) def likes(self, blog): """Retrieve liked posts""" params = {"limit": "50", "before": self.before} - while True: - posts = self._call(blog, "likes", params)["liked_posts"] - if not posts: - return - yield from posts - params["before"] = posts[-1]["liked_timestamp"] + return self._pagination(blog, "/likes", params, key="liked_posts") - def _call(self, blog, endpoint, params, **kwargs): - if self.api_key: - params["api_key"] = self.api_key - url = "https://api.tumblr.com/v2/blog/{}/{}".format( - blog, endpoint) - - response = self.request(url, params=params, **kwargs) + def _call(self, endpoint, params, **kwargs): + url = self.ROOT + endpoint + kwargs["params"] = params + response = self.request(url, **kwargs) try: data = response.json() @@ -535,7 +540,7 @@ def _call(self, blog, endpoint, params, **kwargs): if self.extractor.config("ratelimit") == "wait": self.extractor.wait(seconds=reset) - return self._call(blog, endpoint, params) + return self._call(endpoint, params, **kwargs) t = (datetime.now() + timedelta(seconds=float(reset))).time() raise exception.StopExtraction( @@ -547,6 +552,29 @@ 
From 5297ee0cd9410642be5e214e120e20bb3b39a6ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Mon, 24 Apr 2023 22:01:47 +0200
Subject: [PATCH 028/252] [tumblr] add 'day' extractor (#3951)

---
 docs/supportedsites.md         |  2 +-
 gallery_dl/extractor/tumblr.py | 31 ++++++++++++++++++++++++++++++-
 scripts/supportedsites.py      |  3 +++
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index c40ea813bf..4db505fb2a 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -844,7 +844,7 @@ Consider all sites to be NSFW unless otherwise known.
     Tumblr
     https://www.tumblr.com/
-    Likes, Posts, Tag Searches, User Profiles
+    Days, Likes, Posts, Tag Searches, User Profiles
     OAuth

diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 1473e14834..b45609d729 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -10,7 +10,7 @@

 from .common import Extractor, Message
 from .. import text, oauth, exception
-from datetime import datetime, timedelta
+from datetime import datetime, date, timedelta
 import re


@@ -421,6 +421,35 @@ def posts(self):
         return self.api.posts(self.blog, {"tag": self.tag})


+class TumblrDayExtractor(TumblrExtractor):
+    """Extractor for Tumblr user's posts by day"""
+    subcategory = "day"
+    pattern = BASE_PATTERN + r"/day/(\d\d\d\d/\d\d/\d\d)"
+    test = (
+        ("https://mikf123.tumblr.com/day/2018/01/05", {
+            "pattern": r"https://64\.media\.tumblr\.com"
+                       r"/1a2be8c63f1df58abd2622861696c72a"
+                       r"/tumblr_ozm9nqst9t1wgha4yo1_1280\.jpg",
+            "keyword": {"id": 169341068404},
+            "count": 1,
+        }),
+        ("https://www.tumblr.com/blog/view/mikf123/day/2018/01/05"),
+        ("https://www.tumblr.com/blog/mikf123/day/2018/01/05"),
+        ("https://www.tumblr.com/mikf123/day/2018/01/05"),
+    )
+
+    def __init__(self, match):
+        TumblrExtractor.__init__(self, match)
+        year, month, day = match.group(4).split("/")
+        self.date_min = ts = (
+            # 719163 == date(1970, 1, 1).toordinal()
+            date(int(year), int(month), int(day)).toordinal() - 719163) * 86400
+        self.api.before = ts + 86400
+
+    def posts(self):
+        return self.api.posts(self.blog, {})
+
+
 class TumblrLikesExtractor(TumblrExtractor):
     """Extractor for a Tumblr user's liked posts"""
     subcategory = "likes"

diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index 74100d4f78..93ed809baa 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -229,6 +229,9 @@
     "smugmug": {
         "path": "Images from Users and Folders",
     },
+    "tumblr": {
+        "day": "Days",
+    },
     "twitter": {
         "media": "Media Timelines",
         "tweets": "",
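
A note on the hard-coded 719163 above: it is date(1970, 1, 1).toordinal(), so
the expression turns a calendar day into a UTC Unix timestamp without going
through the timezone-dependent datetime.timestamp(). A quick sanity check
(illustrative only, not part of the patch):

    from datetime import date, datetime, timezone

    assert date(1970, 1, 1).toordinal() == 719163

    ts = (date(2018, 1, 5).toordinal() - 719163) * 86400
    assert ts == datetime(2018, 1, 5, tzinfo=timezone.utc).timestamp()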
From 7459e4abcea68e9d3e3f43f8026c89bc414fb942 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Tue, 25 Apr 2023 14:17:25 +0200
Subject: [PATCH 029/252] [postprocessor:metadata] fix traversing more than
 1 level deep for mode 'modify' and 'delete'

---
 gallery_dl/postprocessor/metadata.py | 23 +++++++++-----
 test/test_postprocessor.py           | 46 +++++++++++++++++-----------
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index 714f4fefad..03294b483f 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -124,10 +124,8 @@ def _run_modify(self, pathfmt):
         for key, func in self.fields.items():
             obj = kwdict
             try:
-                while "[" in key:
-                    name, _, key = key.partition("[")
-                    obj = obj[name]
-                    key = key.rstrip("]")
+                if "[" in key:
+                    obj, key = _traverse(obj, key)
                 obj[key] = func(kwdict)
             except Exception:
                 pass
@@ -137,10 +135,8 @@ def _run_delete(self, pathfmt):
         for key in self.fields:
             obj = kwdict
             try:
-                while "[" in key:
-                    name, _, key = key.partition("[")
-                    obj = obj[name]
-                    key = key.rstrip("]")
+                if "[" in key:
+                    obj, key = _traverse(obj, key)
                 del obj[key]
             except Exception:
                 pass
@@ -214,4 +210,15 @@ def _make_encoder(options, indent=None):
     )


+def _traverse(obj, key):
+    name, _, key = key.partition("[")
+    obj = obj[name]
+
+    while "[" in key:
+        name, _, key = key.partition("[")
+        obj = obj[name.rstrip("]")]
+
+    return obj, key.strip("]")
+
+
 __postprocessor__ = MetadataPP

diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index c78d7b03d6..543947b8c6 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -392,47 +392,57 @@ def test_metadata_modify(self):
         self._create({
             "mode": "modify",
             "fields": {
-                "foo"     : "{filename}-{foo!s}",
-                "foo2"    : "\fE bar['bax'] + 122",
-                "bar[baz]": "{_now}",
-                "bar[ba2]": "test",
+                "foo"        : "{filename}-{foo!s}",
+                "foo2"       : "\fE bar['bax'] + 122",
+                "bar[baz]"   : "{_now}",
+                "bar[ba2]"   : "\fE {}",
+                "bar[ba2][a]": "test",
             },
         }, kwdict)
-        pdict = self.pathfmt.kwdict

+        pdict = self.pathfmt.kwdict
         self.assertIsNot(kwdict, pdict)
         self.assertEqual(pdict["foo"], kwdict["foo"])
         self.assertEqual(pdict["bar"], kwdict["bar"])

         self._trigger()

-        self.assertEqual(pdict["foo"] , "file-0")
-        self.assertEqual(pdict["foo2"] , 123)
-        self.assertEqual(pdict["bar"]["ba2"], "test")
+        self.assertEqual(pdict["foo"] , "file-0")
+        self.assertEqual(pdict["foo2"], 123)
+        self.assertEqual(pdict["bar"]["ba2"]["a"], "test")
         self.assertIsInstance(pdict["bar"]["baz"], datetime)

     def test_metadata_delete(self):
-        kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3}}
-        self._create({"mode": "delete", "fields": ["foo", "bar[baz]"]}, kwdict)
-        pdict = self.pathfmt.kwdict
+        kwdict = {
+            "foo": 0,
+            "bar": {
+                "bax": 1,
+                "bay": 2,
+                "baz": {"a": 3, "b": 4},
+            },
+        }
+        self._create({
+            "mode": "delete",
+            "fields": ["foo", "bar[bax]", "bar[baz][a]"],
+        }, kwdict)

+        pdict = self.pathfmt.kwdict
         self.assertIsNot(kwdict, pdict)
+        self.assertEqual(pdict["foo"], kwdict["foo"])
         self.assertEqual(pdict["bar"], kwdict["bar"])

-        del kwdict["foo"]
-        del kwdict["bar"]["baz"]
-
         self._trigger()
+
         self.assertNotIn("foo", pdict)
-        self.assertNotIn("baz", pdict["bar"])
-        self.assertEqual(kwdict["bar"], pdict["bar"])
+        self.assertNotIn("bax", pdict["bar"])
+        self.assertNotIn("a", pdict["bar"]["baz"])

         # no errors for deleted/undefined fields
         self._trigger()
         self.assertNotIn("foo", pdict)
-        self.assertNotIn("baz", pdict["bar"])
-        self.assertEqual(kwdict["bar"], pdict["bar"])
+        self.assertNotIn("bax", pdict["bar"])
+        self.assertNotIn("a", pdict["bar"]["baz"])

     def test_metadata_option_skip(self):
         self._create({"skip": True})
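
To see what the new _traverse() enables: a field name like "bar[baz][a]" now
addresses arbitrarily deep nesting instead of stopping after one level. A
standalone sketch of the lookup-and-delete path, simplified from the code
above:

    kwdict = {"bar": {"baz": {"a": 3, "b": 4}}}

    obj, key = kwdict, "bar[baz][a]"
    name, _, key = key.partition("[")
    obj = obj[name]                     # kwdict["bar"]
    while "[" in key:
        name, _, key = key.partition("[")
        obj = obj[name.rstrip("]")]     # ...["baz"]
    del obj[key.strip("]")]             # del ...["a"]

    assert kwdict == {"bar": {"baz": {"b": 4}}}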
self._create({"skip": True}) From 3905f05f000dee6497e7e4fecaf8b846713d8697 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 25 Apr 2023 14:30:18 +0200 Subject: [PATCH 030/252] [postprocessor:metadata] support putting keys in quotes for mode 'modify' and 'delete' based on fe41a2b1 --- gallery_dl/postprocessor/metadata.py | 4 ++-- test/test_postprocessor.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 03294b483f..5004bed6de 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -216,9 +216,9 @@ def _traverse(obj, key): while "[" in key: name, _, key = key.partition("[") - obj = obj[name.rstrip("]")] + obj = obj[name.strip("\"']")] - return obj, key.strip("]") + return obj, key.strip("\"']") __postprocessor__ = MetadataPP diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 543947b8c6..1630e8a3a5 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -392,11 +392,11 @@ def test_metadata_modify(self): self._create({ "mode": "modify", "fields": { - "foo" : "{filename}-{foo!s}", - "foo2" : "\fE bar['bax'] + 122", - "bar[baz]" : "{_now}", - "bar[ba2]" : "\fE {}", - "bar[ba2][a]": "test", + "foo" : "{filename}-{foo!s}", + "foo2" : "\fE bar['bax'] + 122", + "bar['baz']" : "{_now}", + "bar[\"ba2\"]" : "\fE {}", + "bar['ba2'][a]": "test", }, }, kwdict) @@ -423,7 +423,7 @@ def test_metadata_delete(self): } self._create({ "mode": "delete", - "fields": ["foo", "bar[bax]", "bar[baz][a]"], + "fields": ["foo", "bar['bax']", "bar[\"baz\"][a]"], }, kwdict) pdict = self.pathfmt.kwdict From fd3b5b457a2d69e9ffd0942691a2f172ee86c97f Mon Sep 17 00:00:00 2001 From: anelki Date: Tue, 25 Apr 2023 10:06:45 -0500 Subject: [PATCH 031/252] added Macports install instructions to README --- README.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.rst b/README.rst index c3d09ff1cc..2ca566210a 100644 --- a/README.rst +++ b/README.rst @@ -123,6 +123,15 @@ For macOS or Linux users using Homebrew: brew install gallery-dl +MacPorts +-------- + +For macOS users with MacPorts: + +.. code:: bash + + sudo port install gallery-dl + Usage ===== From 28419bf45a473267a123469b55ebdd92f4700015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 26 Apr 2023 18:50:09 +0200 Subject: [PATCH 032/252] [itchio] add 'game' extractor (#3923) --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/itchio.py | 82 ++++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + 4 files changed, 90 insertions(+) create mode 100644 gallery_dl/extractor/itchio.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 4db505fb2a..c0b890b02b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -427,6 +427,12 @@ Consider all sites to be NSFW unless otherwise known. 
Galleries, individual Images + + itch.io + https://itch.io/ + Games + + Keenspot http://www.keenspot.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 7cc12108cd..3a76bdc723 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -74,6 +74,7 @@ "instagram", "issuu", "itaku", + "itchio", "kabeuchi", "keenspot", "kemonoparty", diff --git a/gallery_dl/extractor/itchio.py b/gallery_dl/extractor/itchio.py new file mode 100644 index 0000000000..6034d12251 --- /dev/null +++ b/gallery_dl/extractor/itchio.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://itch.io/""" + +from .common import Extractor, Message +from .. import text + + +class ItchioGameExtractor(Extractor): + """Extractor for itch.io games""" + category = "itchio" + subcategory = "game" + root = "https://itch.io" + directory_fmt = ("{category}", "{user[name]}") + filename_fmt = "{game[title]} ({id}).{extension}" + archive_fmt = "{id}" + pattern = r"(?:https?://)?(\w+).itch\.io/([\w-]+)" + test = ( + ("https://sirtartarus.itch.io/a-craft-of-mine", { + "pattern": r"https://\w+\.ssl\.hwcdn\.net/upload2" + r"/game/1983311/7723751\?", + "count": 1, + "keyword": { + "extension": "", + "filename": "7723751", + "game": { + "id": 1983311, + "noun": "game", + "title": "A Craft Of Mine", + "url": "https://sirtartarus.itch.io/a-craft-of-mine", + }, + "user": { + "id": 4060052, + "name": "SirTartarus", + "url": "https://sirtartarus.itch.io", + }, + }, + }), + ) + + def __init__(self, match): + self.user, self.slug = match.groups() + Extractor.__init__(self, match) + + def items(self): + game_url = "https://{}.itch.io/{}".format(self.user, self.slug) + page = self.request(game_url).text + + params = { + "source": "view_game", + "as_props": "1", + "after_download_lightbox": "true", + } + headers = { + "Referer": game_url, + "X-Requested-With": "XMLHttpRequest", + "Origin": "https://{}.itch.io".format(self.user), + } + data = { + "csrf_token": text.unquote(self.session.cookies["itchio_token"]), + } + + for upload_id in text.extract_iter(page, 'data-upload_id="', '"'): + file_url = "{}/file/{}".format(game_url, upload_id) + info = self.request(file_url, method="POST", params=params, + headers=headers, data=data).json() + + game = info["lightbox"]["game"] + user = info["lightbox"]["user"] + game["url"] = game_url + user.pop("follow_button", None) + game = {"game": game, "user": user, "id": upload_id} + + url = info["url"] + yield Message.Directory, game + yield Message.Url, url, text.nameext_from_url(url, game) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 93ed809baa..1ebebddcf3 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -63,6 +63,7 @@ "imgth" : "imgth", "imgur" : "imgur", "joyreactor" : "JoyReactor", + "itchio" : "itch.io", "kabeuchi" : "かべうち", "kemonoparty" : "Kemono", "lineblog" : "LINE BLOG", From 7865067d199858e74343d809a4a2b9f6674f6fe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 26 Apr 2023 18:01:07 +0200 Subject: [PATCH 033/252] [shimmie2] add generic extractors for Shimmie2 sites (#3734) add support for - loudbooru.com (#3734) - booru.cavemanon.xyz (#3734) - giantessbooru.com (#943) - tentaclerape.net --- docs/supportedsites.md | 40 +++- 
gallery_dl/extractor/__init__.py | 2 +- gallery_dl/extractor/paheal.py | 4 +- gallery_dl/extractor/shimmie2.py | 326 +++++++++++++++++++++++++++++++ 4 files changed, 363 insertions(+), 9 deletions(-) create mode 100644 gallery_dl/extractor/shimmie2.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c0b890b02b..ebdc666117 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -541,12 +541,6 @@ Consider all sites to be NSFW unless otherwise known. Albums, Channels Supported - - meme.museum - https://meme.museum/ - Posts, Tag Searches - - My Hentai Gallery https://myhentaigallery.com/ @@ -1266,6 +1260,40 @@ Consider all sites to be NSFW unless otherwise known. + + Shimmie2 Instances + + + meme.museum + https://meme.museum/ + Posts, Tag Searches + + + + Loudbooru + https://loudbooru.com/ + Posts, Tag Searches + + + + Giantessbooru + https://giantessbooru.com/ + Posts, Tag Searches + + + + Tentaclerape + https://tentaclerape.net/ + Posts, Tag Searches + + + + Cavemanon + https://booru.cavemanon.xyz/ + Posts, Tag Searches + + + szurubooru Instances diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 3a76bdc723..9841ca7d63 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -94,7 +94,6 @@ "mangapark", "mangasee", "mangoxo", - "mememuseum", "misskey", "myhentaigallery", "myportfolio", @@ -134,6 +133,7 @@ "seiga", "senmanga", "sexcom", + "shimmie2", "simplyhentai", "skeb", "slickpic", diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 56e3b39455..f0a50c8416 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,7 +14,7 @@ class PahealExtractor(Extractor): """Base class for paheal extractors""" - basecategory = "booru" + basecategory = "shimmie2" category = "paheal" filename_fmt = "{category}_{id}_{md5}.{extension}" archive_fmt = "{id}" diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py new file mode 100644 index 0000000000..285cd8fed6 --- /dev/null +++ b/gallery_dl/extractor/shimmie2.py @@ -0,0 +1,326 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Shimmie2 instances""" + +from .common import BaseExtractor, Message +from .. 
import text + + +class Shimmie2Extractor(BaseExtractor): + """Base class for shimmie2 extractors""" + basecategory = "shimmie2" + filename_fmt = "{category}_{id}{md5:?_//}.{extension}" + archive_fmt = "{id}" + + def __init__(self, match): + BaseExtractor.__init__(self, match) + + try: + instance = INSTANCES[self.category] + except KeyError: + pass + else: + cookies = instance.get("cookies") + if cookies: + domain = self.root.rpartition("/")[2] + self._update_cookies_dict(cookies, domain=domain) + file_url = instance.get("file_url") + if file_url: + self.file_url_fmt = file_url + + def items(self): + data = self.metadata() + + for post in self.posts(): + + for key in ("id", "width", "height"): + post[key] = text.parse_int(post[key]) + post["tags"] = text.unquote(post["tags"]) + post.update(data) + + url = post["file_url"] + if "/index.php?" in url: + post["filename"], _, post["extension"] = \ + url.rpartition("/")[2].rpartition(".") + else: + text.nameext_from_url(url, post) + + yield Message.Directory, post + yield Message.Url, url, post + + def metadata(self): + """Return general metadata""" + return () + + def posts(self): + """Return an iterable containing data of all relevant posts""" + return () + + +INSTANCES = { + "mememuseum": { + "root": "https://meme.museum", + "pattern": r"meme\.museum", + }, + "loudbooru": { + "root": "https://loudbooru.com", + "pattern": r"loudbooru\.com", + "cookies": {"ui-tnc-agreed": "true"}, + }, + "giantessbooru": { + "root": "https://giantessbooru.com", + "pattern": r"giantessbooru\.com", + "cookies": {"agreed": "true"}, + }, + "tentaclerape": { + "root": "https://tentaclerape.net", + "pattern": r"tentaclerape\.net", + }, + "cavemanon": { + "root": "https://booru.cavemanon.xyz", + "pattern": r"booru\.cavemanon\.xyz", + "file_url": "{0}/index.php?q=image/{2}.{4}" + }, +} + +BASE_PATTERN = Shimmie2Extractor.update(INSTANCES) + r"/(?:index\.php\?q=)?" 
+ + +class Shimmie2TagExtractor(Shimmie2Extractor): + """Extractor for shimmie2 posts by tag search""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + file_url_fmt = "{}/_images/{}/{}%20-%20{}.{}" + pattern = BASE_PATTERN + r"post/list/([^/?#]+)(?:/(\d+))?()" + test = ( + ("https://meme.museum/post/list/animated/1", { + "pattern": r"https://meme\.museum/_images/\w+/\d+%20-%20", + "count": ">= 30" + }), + ("https://loudbooru.com/post/list/original_character/1", { + "pattern": r"https://loudbooru\.com/_images/[0-9a-f]{32}/\d+", + "range": "1-100", + "count": 100, + }), + ("https://giantessbooru.com/post/list/smiling/1", { + "pattern": r"https://giantessbooru\.com/_images/[0-9a-f]{32}/\d+", + "range": "1-100", + "count": 100, + }), + ("https://tentaclerape.net/post/list/comic/1", { + "pattern": r"https://tentaclerape\.net/_images/[0-9a-f]{32}/\d+", + "range": "1-100", + "count": 100, + }), + ("https://booru.cavemanon.xyz/index.php?q=post/list/Amber/1", { + "pattern": r"https://booru\.cavemanon\.xyz" + r"/index\.php\?q=image/\d+\.\w+", + "range": "1-100", + "count": 100, + }), + ) + + def __init__(self, match): + Shimmie2Extractor.__init__(self, match) + lastindex = match.lastindex + self.tags = text.unquote(match.group(lastindex-2)) + self.page = match.group(lastindex-1) + + def metadata(self): + return {"search_tags": self.tags} + + def posts(self): + pnum = text.parse_int(self.page, 1) + file_url_fmt = self.file_url_fmt.format + + init = True + mime = "" + + while True: + url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) + page = self.request(url).text + extr = text.extract_from(page) + + if init: + init = False + has_mime = ("data-mime='" in page) + has_pid = ("data-post-id='" in page) + + while True: + if has_mime: + mime = extr("data-mime='", "'") + if has_pid: + pid = extr("data-post-id='", "'") + else: + pid = extr("href='/post/view/", "?") + + if not pid: + break + + tags, dimensions, size = extr("title='", "'").split(" // ") + width, _, height = dimensions.partition("x") + md5 = extr("/_thumbs/", "/") + + yield { + "file_url": file_url_fmt( + self.root, md5, pid, text.quote(tags), + mime.rpartition("/")[2] if mime else "jpg"), + "id": pid, + "md5": md5, + "tags": tags, + "width": width, + "height": height, + "size": text.parse_bytes(size[:-1]), + } + + pnum += 1 + if not extr(">Next<", ">"): + if not extr("/{}'>{}<".format(pnum, pnum), ">"): + return + + +class Shimmie2PostExtractor(Shimmie2Extractor): + """Extractor for single shimmie2 posts""" + subcategory = "post" + pattern = BASE_PATTERN + r"post/view/(\d+)" + test = ( + ("https://meme.museum/post/view/10243", { + "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc" + r"49971f78/10243%20-%20g%20beard%20open_source%20richar" + r"d_stallman%20stallman%20tagme%20text\.jpg", + "content": "45565f3f141fc960a8ae1168b80e718a494c52d2", + "keyword": { + "extension": "jpg", + "file_url": "https://meme.museum/_images/105febebcd5ca791ee332" + "adc49971f78/10243%20-%20g%20beard%20open_source%2" + "0richard_stallman%20stallman%20tagme%20text.jpg", + "filename": "10243 - g beard open_source richard_stallman " + "stallman tagme text", + "height": 451, + "id": 10243, + "md5": "105febebcd5ca791ee332adc49971f78", + "size": 0, + "subcategory": "post", + "tags": "/g/ beard open_source " + "richard_stallman stallman tagme text", + "width": 480, + }, + }), + ("https://loudbooru.com/post/view/33828", { + "pattern": r"https://loudbooru\.com/_images/.+\.png", + "content": 
"a4755f787ba23ae2aa297a46810f802ca9032739", + "keyword": { + "extension": "png", + "file_url": "https://loudbooru.com/_images/ca2638d903c86e8337f" + "e9aeb4974be88/33828%20-%202020%20artist%3Astikyfi" + "nkaz%20character%3Alisa_loud%20cover%20fanfiction" + "%3Aplatz_eins%20frowning%20half-closed_eyes%20sol" + "o%20text%20title_card.png", + "filename": "33828 - 2020 artist:stikyfinkaz character:lisa_" + "loud cover fanfiction:platz_eins frowning " + "half-closed_eyes solo text title_card", + "height": 1920, + "id": 33828, + "md5": "ca2638d903c86e8337fe9aeb4974be88", + "tags": "2020 artist:stikyfinkaz character:lisa_loud cover " + "fanfiction:platz_eins frowning half-closed_eyes " + "solo text title_card", + "width": 1078, + }, + }), + ("https://giantessbooru.com/post/view/41", { + "pattern": r"https://giantessbooru\.com/_images" + r"/3f67e1986496806b7b14ff3e82ac5af4/41\.jpg", + "content": "79115ed309d1f4e82e7bead6948760e889139c91", + "keyword": { + "extension": "jpg", + "file_url": "https://giantessbooru.com/_images" + "/3f67e1986496806b7b14ff3e82ac5af4/41.jpg", + "filename": "41", + "height": 0, + "id": 41, + "md5": "3f67e1986496806b7b14ff3e82ac5af4", + "size": 0, + "tags": "anime bare_midriff color drawing gentle giantess " + "karbo looking_at_tinies negeyari outdoors smiling " + "snake_girl white_hair", + "width": 0 + + + }, + }), + ("https://tentaclerape.net/post/view/10", { + "pattern": r"https://tentaclerape\.net/\./index\.php" + r"\?q=/image/10\.jpg", + "content": "d0fd8f0f6517a76cb5e23ba09f3844950bf2c516", + "keyword": { + "extension": "jpg", + "file_url": "https://tentaclerape.net/./index.php" + "?q=/image/10.jpg", + "filename": "10", + "height": 427, + "id": 10, + "md5": "945db71eeccaef82ce44b77564260c0b", + "size": 0, + "subcategory": "post", + "tags": "Deviant_Art Pet Tentacle artist_sche blonde_hair " + "blouse boots green_eyes highheels leash miniskirt " + "octopus schoolgirl white_skin willing", + "width": 300, + }, + }), + # video + ("https://tentaclerape.net/post/view/91267", { + "pattern": r"https://tentaclerape\.net/\./index\.php" + r"\?q=/image/91267\.mp4", + }), + ("https://booru.cavemanon.xyz/index.php?q=post/view/8335", { + "pattern": r"https://booru\.cavemanon\.xyz" + r"/index\.php\?q=image/8335\.png", + "content": "7158f7e4abbbf143bad5835eb93dbe4d68c1d4ab", + "keyword": { + "extension": "png", + "file_url": "https://booru.cavemanon.xyz" + "/index.php?q=image/8335.png", + "filename": "8335", + "height": 460, + "id": 8335, + "md5": "", + "size": 0, + "tags": "Color Fang", + "width": 459, + }, + }), + ) + + def __init__(self, match): + Shimmie2Extractor.__init__(self, match) + self.post_id = match.group(match.lastindex) + + def posts(self): + url = "{}/post/view/{}".format(self.root, self.post_id) + extr = text.extract_from(self.request(url).text) + + post = { + "id" : self.post_id, + "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), + "md5" : extr("/_thumbs/", "/"), + "file_url": self.root + ( + extr("id='main_image' src='", "'") or + extr("").partition( + " ")[0].strip("\"'"), + "size" : 0, + } + + if not post["md5"]: + post["md5"] = text.extr(post["file_url"], "/_images/", "/") + + return (post,) From a96745368e898d4a8aea227ef3e4d1d8cb223dcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 26 Apr 2023 19:31:27 +0200 Subject: [PATCH 034/252] "fix" tests on Python 3.4 and 3.5 can't rely on dict insertion order --- test/test_postprocessor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
From a96745368e898d4a8aea227ef3e4d1d8cb223dcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Wed, 26 Apr 2023 19:31:27 +0200
Subject: [PATCH 034/252] "fix" tests on Python 3.4 and 3.5

can't rely on dict insertion order (only guaranteed since Python 3.7)
---
 test/test_postprocessor.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index 1630e8a3a5..ac89b55cb9 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -388,14 +388,13 @@ def test_metadata_stdout(self):
 """)

     def test_metadata_modify(self):
-        kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3}}
+        kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3, "ba2": {}}}
         self._create({
             "mode": "modify",
             "fields": {
                 "foo"          : "{filename}-{foo!s}",
                 "foo2"         : "\fE bar['bax'] + 122",
-                "bar['baz']"   : "{_now}",
-                "bar[\"ba2\"]" : "\fE {}",
+                "bar[\"baz\"]" : "{_now}",
                 "bar['ba2'][a]": "test",
             },
         }, kwdict)

From 9f76783ac093f4ff90a18a85a2a54d0c6ee70bb9 Mon Sep 17 00:00:00 2001
From: thatfuckingbird <67429906+thatfuckingbird@users.noreply.github.com>
Date: Wed, 26 Apr 2023 22:49:29 +0200
Subject: [PATCH 035/252] [pixiv] allow sorting by popularity (requires pixiv
 premium)

---
 gallery_dl/extractor/pixiv.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index a17518fedb..9f3acad7a1 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -596,6 +596,9 @@ def metadata(self):
         sort_map = {
             "date": "date_asc",
             "date_d": "date_desc",
+            "popular_d": "popular_desc",
+            "popular_male_d": "popular_male_desc",
+            "popular_female_d": "popular_female_desc",
         }
         try:
             self.sort = sort = sort_map[sort]

From 215028a462f09c8deec02d94e423fa1f48c828cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Thu, 27 Apr 2023 13:12:11 +0200
Subject: [PATCH 036/252] [manganelo] match more minor version separators
 (#3972)

---
 gallery_dl/extractor/manganelo.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index 5ba18a3e51..6fd9f495e2 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -16,21 +16,26 @@ class ManganeloBase():
     category = "manganelo"
     root = "https://chapmanganato.com"
+    _match_chapter = None

     def __init__(self, match):
         domain, path = match.groups()
         super().__init__(match, "https://" + domain + path)
         self.session.headers['Referer'] = self.root

-        self._match_chapter = re.compile(
-            r"(?:[Vv]ol\.?\s*(\d+)\s?)?"
-            r"[Cc]hapter\s*([^:]+)"
-            r"(?::\s*(.+))?").match
+        if self._match_chapter is None:
+            ManganeloBase._match_chapter = re.compile(
+                r"(?:[Vv]ol\.?\s*(\d+)\s?)?"
+                r"[Cc]hapter\s*(\d+)([^:]*)"
+                r"(?::\s*(.+))?").match

     def _parse_chapter(self, info, manga, author, date=None):
         match = self._match_chapter(info)
-        volume, chapter, title = match.groups() if match else ("", "", info)
-        chapter, sep, minor = chapter.partition(".")
+        if match:
+            volume, chapter, minor, title = match.groups()
+        else:
+            volume = chapter = minor = ""
+            title = info

         return {
             "manga"        : manga,
@@ -39,7 +44,7 @@ def _parse_chapter(self, info, manga, author, date=None):
             "title"        : text.unescape(title) if title else "",
             "volume"       : text.parse_int(volume),
             "chapter"      : text.parse_int(chapter),
-            "chapter_minor": sep + minor,
+            "chapter_minor": minor,
             "lang"         : "en",
             "language"     : "English",
         }
@@ -61,6 +66,10 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
             "keyword": "06e01fa9b3fc9b5b954c0d4a98f0153b40922ded",
             "count": 45,
         }),
+        ("https://chapmanganato.com/manga-no991297/chapter-8", {
+            "keyword": {"chapter": 8, "chapter_minor": "-1"},
+            "count": 20,
+        }),
         ("https://readmanganato.com/manga-gn983696/chapter-23"),
         ("https://manganelo.com/chapter/gamers/chapter_15"),
         ("https://manganelo.com/chapter/gq921227/chapter_23"),
From 1b918bd9378ac049bdb03e31b5949f6e6caa42b9 Mon Sep 17 00:00:00 2001
From: Alexandru Vasilescu
Date: Fri, 28 Apr 2023 13:13:25 +0300
Subject: [PATCH 037/252] fix(extractor): fix extraction for cross-posted
 reddit videos and galleries

---
 gallery_dl/extractor/reddit.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index cefe8d37df..b7260dcf16 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -56,17 +56,26 @@ def items(self):
                 submission["num"] = 0
                 url = submission["url"]

-                if url and url.startswith("https://i.redd.it/"):
+                if not url:
+                    continue
+
+                if url.startswith("https://i.redd.it/"):
                     text.nameext_from_url(url, submission)
                     yield Message.Url, url, submission

-                elif "gallery_data" in submission:
+                elif url.startswith("https://www.reddit.com/gallery/"):
+                    submission_with_gallery = submission
+                    if "crosspost_parent_list" in submission_with_gallery:
+                        submission_with_gallery = submission["crosspost_parent_list"][-1]
+                    if "gallery_data" not in submission_with_gallery:
+                        continue
+
                     for submission["num"], url in enumerate(
-                            self._extract_gallery(submission), 1):
+                            self._extract_gallery(submission_with_gallery), 1):
                         text.nameext_from_url(url, submission)
                         yield Message.Url, url, submission

-                elif submission["is_video"]:
+                elif url.startswith("https://v.redd.it/"):
                     if videos:
                         text.nameext_from_url(url, submission)
                         url = "ytdl:" + self._extract_video(submission)

From d4f8b2fe2206afeedf5fa8a2bfa7f6655a135811 Mon Sep 17 00:00:00 2001
From: Alexandru Vasilescu
Date: Fri, 28 Apr 2023 13:45:23 +0300
Subject: [PATCH 038/252] fix: linter issues

---
 gallery_dl/extractor/reddit.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index b7260dcf16..e1f1d27ff3 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -64,14 +64,16 @@ def items(self):
                     yield Message.Url, url, submission

                 elif url.startswith("https://www.reddit.com/gallery/"):
-                    submission_with_gallery = submission
-                    if "crosspost_parent_list" in submission_with_gallery:
-                        submission_with_gallery = submission["crosspost_parent_list"][-1]
-                    if "gallery_data" not in submission_with_gallery:
+                    gallery_submission = submission
+                    if "crosspost_parent_list" in gallery_submission:
+                        gallery_submission = \
+                            submission["crosspost_parent_list"][-1]
+                    if "gallery_data" not in gallery_submission:
                         continue

-                    for submission["num"], url in enumerate(
-                            self._extract_gallery(submission), 1):
+                    gallery = self._extract_gallery(gallery_submission)
+
+                    for submission["num"], url in enumerate(gallery, 1):
                         text.nameext_from_url(url, submission)
                         yield Message.Url, url, submission
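
The data shape this relies on, sketched with assumed placeholder values: for a
cross-posted gallery, "gallery_data" lives on the original submission inside
crosspost_parent_list, not on the crosspost itself.

    submission = {
        "url": "https://www.reddit.com/gallery/abc123",   # made-up id
        "crosspost_parent_list": [
            {"gallery_data": {"items": [{"media_id": "xyz"}]}},
        ],
    }

    gallery_submission = submission
    if "crosspost_parent_list" in gallery_submission:
        gallery_submission = submission["crosspost_parent_list"][-1]
    assert "gallery_data" in gallery_submission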
Apr 2023 18:05:45 +0200 Subject: [PATCH 041/252] [ytdl] fix tests tests pass with latest Git HEAD, but not with the current PyPI version --- test/test_ytdl.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test_ytdl.py b/test/test_ytdl.py index a28eaeea9d..4c20f6711b 100644 --- a/test/test_ytdl.py +++ b/test/test_ytdl.py @@ -235,6 +235,12 @@ def _(self, cmdline, option=util.SENTINEL, expected=None): class Test_CommandlineArguments_YtDlp(Test_CommandlineArguments): module_name = "yt_dlp" + @classmethod + def setUpClass(cls): + super().setUpClass() + if cls.module.version.__version__ > "2023.03.04": + cls.test_geo_bypass = cls._test_geo_bypass_xff + def test_retries_extractor(self): inf = float("inf") @@ -269,7 +275,7 @@ def test_metadata_from_title(self): "title:%(artist)s - %(title)s")], }) - def test_geo_bypass(self): + def _test_geo_bypass_xff(self): self._("--geo-bypass", "geo_bypass", "default") self._("--no-geo-bypass", From 15d7c5a199e929a25cf30c95fd766e64ef85ca94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 30 Apr 2023 13:53:51 +0200 Subject: [PATCH 042/252] [behance] 'items()' -> 'values()' we only need 'size', 'name' is unnecessary --- gallery_dl/extractor/behance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index 95a1726bff..d8cc51d311 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -152,7 +152,7 @@ def get_images(self, data): elif mtype == "MediaCollectionModule": for component in module["components"]: - for name, size in component["imageSizes"].items(): + for size in component["imageSizes"].values(): if size: parts = size["url"].split("/") parts[4] = "source" From 5fb7107f2b3746081684846f5d7417d36b5a086a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 30 Apr 2023 15:20:45 +0200 Subject: [PATCH 043/252] [imxto] fix 'gallery' extraction support both single and double quotes --- gallery_dl/extractor/imagehosts.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 66112a946e..df4ff26556 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -153,8 +153,9 @@ def items(self): "_extractor": ImxtoImageExtractor, "title": text.unescape(title.partition(">")[2]).strip(), } - for url in text.extract_iter(page, ' Date: Sun, 30 Apr 2023 15:35:32 +0200 Subject: [PATCH 044/252] [pixiv] fix 'pixivision' extraction --- gallery_dl/extractor/pixiv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 9f3acad7a1..b70403116e 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -673,7 +673,7 @@ def __init__(self, match): def works(self): return ( - self.api.illust_detail(illust_id) + self.api.illust_detail(illust_id.partition("?")[0]) for illust_id in util.unique_sequence(text.extract_iter( self.page, ' Date: Sun, 30 Apr 2023 18:35:03 +0200 Subject: [PATCH 045/252] release version 1.25.3 --- CHANGELOG.md | 31 +++++++++++++++++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a67e3abb6d..a76a0dd391 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,36 @@ # Changelog +## 1.25.3 - 2023-04-30 +### Additions +- [imagefap] extract 
`description` and `categories` metadata ([#3905](https://github.com/mikf/gallery-dl/issues/3905)) +- [imxto] add `gallery` extractor ([#1289](https://github.com/mikf/gallery-dl/issues/1289)) +- [itchio] add `game` extractor ([#3923](https://github.com/mikf/gallery-dl/issues/3923)) +- [nitter] extract user IDs from encoded banner URLs +- [pixiv] allow sorting search results by popularity ([#3970](https://github.com/mikf/gallery-dl/issues/3970)) +- [reddit] match `preview.redd.it` URLs ([#3935](https://github.com/mikf/gallery-dl/issues/3935)) +- [sankaku] support post URLs with MD5 hashes ([#3952](https://github.com/mikf/gallery-dl/issues/3952)) +- [shimmie2] add generic extractors for Shimmie2 sites ([#3734](https://github.com/mikf/gallery-dl/issues/3734), [#943](https://github.com/mikf/gallery-dl/issues/943)) +- [tumblr] add `day` extractor ([#3951](https://github.com/mikf/gallery-dl/issues/3951)) +- [twitter] support `profile-conversation` entries ([#3938](https://github.com/mikf/gallery-dl/issues/3938)) +- [vipergirls] add `thread` and `post` extractors ([#3812](https://github.com/mikf/gallery-dl/issues/3812), [#2720](https://github.com/mikf/gallery-dl/issues/2720), [#731](https://github.com/mikf/gallery-dl/issues/731)) +- [downloader:http] add `consume-content` option ([#3748](https://github.com/mikf/gallery-dl/issues/3748)) +### Fixes +- [2chen] update domain to sturdychan.help +- [behance] fix extraction ([#3980](https://github.com/mikf/gallery-dl/issues/3980)) +- [deviantart] retry downloads with private token ([#3941](https://github.com/mikf/gallery-dl/issues/3941)) +- [imagefap] fix empty `tags` metadata +- [manganelo] support arbitrary minor version separators ([#3972](https://github.com/mikf/gallery-dl/issues/3972)) +- [nozomi] fix file URLs ([#3925](https://github.com/mikf/gallery-dl/issues/3925)) +- [oauth] catch exceptions from `webbrowser.get()` ([#3947](https://github.com/mikf/gallery-dl/issues/3947)) +- [pixiv] fix `pixivision` extraction +- [reddit] ignore `id-max` value `"zik0zj"`/`2147483647` ([#3939](https://github.com/mikf/gallery-dl/issues/3939), [#3862](https://github.com/mikf/gallery-dl/issues/3862), [#3697](https://github.com/mikf/gallery-dl/issues/3697), [#3606](https://github.com/mikf/gallery-dl/issues/3606), [#3546](https://github.com/mikf/gallery-dl/issues/3546), [#3521](https://github.com/mikf/gallery-dl/issues/3521), [#3412](https://github.com/mikf/gallery-dl/issues/3412)) +- [sankaku] sanitize `date:` tags ([#1790](https://github.com/mikf/gallery-dl/issues/1790)) +- [tumblr] fix and update pagination logic ([#2191](https://github.com/mikf/gallery-dl/issues/2191)) +- [twitter] fix `user` metadata when downloading quoted Tweets ([#3922](https://github.com/mikf/gallery-dl/issues/3922)) +- [ytdl] fix crash due to `--geo-bypass` deprecation ([#3975](https://github.com/mikf/gallery-dl/issues/3975)) +- [postprocessor:metadata] support putting keys in quotes +- include more optional dependencies in executables ([#3907](https://github.com/mikf/gallery-dl/issues/3907)) + ## 1.25.2 - 2023-04-15 ### Additions - [deviantart] add `public` option diff --git a/README.rst b/README.rst index 2ca566210a..1f4b692680 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl/version.py 
b/gallery_dl/version.py index b698a01b25..d4ef532d4b 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.25.3-dev" +__version__ = "1.25.3" From aa731c429895280ad0be0588a2baa55876edf8e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 May 2023 16:31:26 +0200 Subject: [PATCH 046/252] [ytdl] run yt-dlp tests with latest code from master (#3989) Only use PyPI version for Python 3.6, since that's no longer supported by the current codebase. --- .github/workflows/tests.yml | 19 ++++++++++++++++--- gallery_dl/version.py | 2 +- test/test_ytdl.py | 11 ++++++++++- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6f9b317f3c..a72761d4c2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -26,13 +26,26 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies - env: - PYV: ${{ matrix.python-version }} run: | pip install -r requirements.txt pip install "flake8<4" "importlib-metadata<5" pip install youtube-dl - if [[ "$PYV" != "3.4" && "$PYV" != "3.5" ]]; then pip install yt-dlp; fi + + - name: Install yt-dlp + run: | + case "${{ matrix.python-version }}" in + 3.4|3.5) + # don't install yt-dlp + ;; + 3.6) + # install from PyPI + pip install yt-dlp + ;; + *) + # install from master + pip install https://github.com/yt-dlp/yt-dlp/archive/refs/heads/master.tar.gz + ;; + esac - name: Lint with flake8 run: | diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d4ef532d4b..0f6b1dff0c 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.25.3" +__version__ = "1.25.4-dev" diff --git a/test/test_ytdl.py b/test/test_ytdl.py index 4c20f6711b..54fa467b5c 100644 --- a/test/test_ytdl.py +++ b/test/test_ytdl.py @@ -238,7 +238,7 @@ class Test_CommandlineArguments_YtDlp(Test_CommandlineArguments): @classmethod def setUpClass(cls): super().setUpClass() - if cls.module.version.__version__ > "2023.03.04": + if cls.module.version.__version__ > "2022.07.18": # last 3.6 release cls.test_geo_bypass = cls._test_geo_bypass_xff def test_retries_extractor(self): @@ -276,6 +276,15 @@ def test_metadata_from_title(self): }) def _test_geo_bypass_xff(self): + self._(["--xff", "default"], + "geo_bypass", "default") + self._(["--xff", "never"], + "geo_bypass", "never") + self._(["--xff", "EN"], + "geo_bypass", "EN") + self._(["--xff", "198.51.100.14/24"], + "geo_bypass", "198.51.100.14/24") + self._("--geo-bypass", "geo_bypass", "default") self._("--no-geo-bypass", From df11214281f4815a7638618a3e8e8f06b9ca576a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 May 2023 18:26:37 +0200 Subject: [PATCH 047/252] [ytdl] improve --xff/--geo-bypass detection (#3989) check if --xff is supported in a try-except block and select expected results accordingly --- test/test_ytdl.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_ytdl.py b/test/test_ytdl.py index 54fa467b5c..878ac85b0d 100644 --- a/test/test_ytdl.py +++ b/test/test_ytdl.py @@ -235,12 +235,6 @@ def _(self, cmdline, option=util.SENTINEL, expected=None): class Test_CommandlineArguments_YtDlp(Test_CommandlineArguments): module_name = "yt_dlp" - @classmethod - def setUpClass(cls): - super().setUpClass() - if cls.module.version.__version__ > "2022.07.18": # last 3.6 release - cls.test_geo_bypass = cls._test_geo_bypass_xff - def test_retries_extractor(self): inf = float("inf") @@ -275,7 +269,13 @@ def test_metadata_from_title(self): "title:%(artist)s - %(title)s")], }) - def _test_geo_bypass_xff(self): + def test_geo_bypass(self): + try: + ytdl.parse_command_line(self.module, ["--xff", "default"]) + except Exception: + # before --xff (c16644642) + return Test_CommandlineArguments.test_geo_bypass(self) + self._(["--xff", "default"], "geo_bypass", "default") self._(["--xff", "never"], From 57cf942bb1594b0dbf7068ff59fe3d334d3bccf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 May 2023 17:49:02 +0200 Subject: [PATCH 048/252] [config] include exception type in error message --- gallery_dl/config.py | 3 ++- gallery_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gallery_dl/config.py b/gallery_dl/config.py index d014293e87..29d0155fc7 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -102,7 +102,8 @@ def load(files=None, strict=False, load=util.json_loads): log.error(exc) sys.exit(1) except Exception as exc: - log.warning("Could not parse '%s': %s", path, exc) + log.error("%s when loading '%s': %s", + exc.__class__.__name__, path, exc) if strict: sys.exit(2) else: diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 0f6b1dff0c..39cfbd1c5d 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.25.4-dev" +__version__ = "1.26.0-dev" From ef4e2d8178aaea10fcd3a060d82f63737ae9f499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 May 2023 19:23:26 +0200 Subject: [PATCH 049/252] [foolfuuka] remove 'archive.alice.al' --- docs/supportedsites.md | 6 ------ gallery_dl/extractor/foolfuuka.py | 12 +----------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ebdc666117..a54eaa3b2d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1415,12 +1415,6 @@ Consider all sites to be NSFW unless otherwise known. Boards, Galleries, Search Results, Threads - - Rozen Arcana - https://archive.alice.al/ - Boards, Galleries, Search Results, Threads - - TokyoChronos https://www.tokyochronos.net/ diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 4f9a6bf5a6..b2689ab809 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2022 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -88,10 +88,6 @@ def _remote_direct(media): "root": "https://boards.fireden.net", "pattern": r"boards\.fireden\.net", }, - "rozenarcana": { - "root": "https://archive.alice.al", - "pattern": r"(?:archive\.)?alice\.al", - }, "tokyochronos": { "root": "https://www.tokyochronos.net", "pattern": r"(?:www\.)?tokyochronos\.net", @@ -137,9 +133,6 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): ("https://boards.fireden.net/sci/thread/11264294/", { "url": "61cab625c95584a12a30049d054931d64f8d20aa", }), - ("https://archive.alice.al/c/thread/2849220/", { - "url": "632e2c8de05de6b3847685f4bf1b4e5c6c9e0ed5", - }), ("https://www.tokyochronos.net/a/thread/241664141/", { "url": "ae03852cf44e3dcfce5be70274cb1828e1dbb7d6", }), @@ -187,7 +180,6 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/"), ("https://desuarchive.org/a/"), ("https://boards.fireden.net/sci/"), - ("https://archive.alice.al/c/"), ("https://www.tokyochronos.net/a/"), ("https://rbt.asia/g/"), ("https://thebarchive.com/b/"), @@ -231,7 +223,6 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): ("https://archiveofsins.com/_/search/text/test/"), ("https://desuarchive.org/_/search/text/test/"), ("https://boards.fireden.net/_/search/text/test/"), - ("https://archive.alice.al/_/search/text/test/"), ("https://www.tokyochronos.net/_/search/text/test/"), ("https://rbt.asia/_/search/text/test/"), ("https://thebarchive.com/_/search/text/test/"), @@ -297,7 +288,6 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/gallery/"), ("https://desuarchive.org/a/gallery/5"), ("https://boards.fireden.net/sci/gallery/6"), - ("https://archive.alice.al/c/gallery/7"), ("https://www.tokyochronos.net/a/gallery/7"), ("https://rbt.asia/g/gallery/8"), ("https://thebarchive.com/b/gallery/9"), From 1870df8b23224dfaefcd4724989d1993c0a69a1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 May 2023 19:25:50 +0200 Subject: [PATCH 050/252] [foolfuuka] remove 'tokyochronos.net' --- docs/supportedsites.md | 6 ------ gallery_dl/extractor/foolfuuka.py | 10 ---------- scripts/supportedsites.py | 1 - 3 files changed, 17 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a54eaa3b2d..9ddb015845 100644 --- 
a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1415,12 +1415,6 @@ Consider all sites to be NSFW unless otherwise known. Boards, Galleries, Search Results, Threads - - TokyoChronos - https://www.tokyochronos.net/ - Boards, Galleries, Search Results, Threads - - RebeccaBlackTech https://rbt.asia/ diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index b2689ab809..13495b54aa 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -88,10 +88,6 @@ def _remote_direct(media): "root": "https://boards.fireden.net", "pattern": r"boards\.fireden\.net", }, - "tokyochronos": { - "root": "https://www.tokyochronos.net", - "pattern": r"(?:www\.)?tokyochronos\.net", - }, "rbt": { "root": "https://rbt.asia", "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)", @@ -133,9 +129,6 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): ("https://boards.fireden.net/sci/thread/11264294/", { "url": "61cab625c95584a12a30049d054931d64f8d20aa", }), - ("https://www.tokyochronos.net/a/thread/241664141/", { - "url": "ae03852cf44e3dcfce5be70274cb1828e1dbb7d6", - }), ("https://rbt.asia/g/thread/61487650/", { "url": "fadd274b25150a1bdf03a40c58db320fa3b617c4", }), @@ -180,7 +173,6 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/"), ("https://desuarchive.org/a/"), ("https://boards.fireden.net/sci/"), - ("https://www.tokyochronos.net/a/"), ("https://rbt.asia/g/"), ("https://thebarchive.com/b/"), ) @@ -223,7 +215,6 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): ("https://archiveofsins.com/_/search/text/test/"), ("https://desuarchive.org/_/search/text/test/"), ("https://boards.fireden.net/_/search/text/test/"), - ("https://www.tokyochronos.net/_/search/text/test/"), ("https://rbt.asia/_/search/text/test/"), ("https://thebarchive.com/_/search/text/test/"), ) @@ -288,7 +279,6 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/gallery/"), ("https://desuarchive.org/a/gallery/5"), ("https://boards.fireden.net/sci/gallery/6"), - ("https://www.tokyochronos.net/a/gallery/7"), ("https://rbt.asia/g/gallery/8"), ("https://thebarchive.com/b/gallery/9"), ) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 1ebebddcf3..ebeac1c21a 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -117,7 +117,6 @@ "thatpervert" : "ThatPervert", "thebarchive" : "The /b/ Archive", "thecollection" : "The /co/llection", - "tokyochronos" : "TokyoChronos", "tumblrgallery" : "TumblrGallery", "vanillarock" : "もえぴりあ", "vidyart" : "/v/idyart", From a08fdfac6e0a04f2a39b05481cf73964d5e887ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 May 2023 19:58:55 +0200 Subject: [PATCH 051/252] [foolfuuka] add 'archive.palanq.win' --- docs/supportedsites.md | 6 ++++++ gallery_dl/extractor/foolfuuka.py | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9ddb015845..355b6369ab 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1415,6 +1415,12 @@ Consider all sites to be NSFW unless otherwise known. 
Boards, Galleries, Search Results, Threads + + Palanq + https://archive.palanq.win/ + Boards, Galleries, Search Results, Threads + + RebeccaBlackTech https://rbt.asia/ diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 13495b54aa..76fb69ebb4 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -88,6 +88,10 @@ def _remote_direct(media): "root": "https://boards.fireden.net", "pattern": r"boards\.fireden\.net", }, + "palanq": { + "root": "https://archive.palanq.win", + "pattern": r"archive\.palanq\.win", + }, "rbt": { "root": "https://rbt.asia", "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)", @@ -129,6 +133,9 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): ("https://boards.fireden.net/sci/thread/11264294/", { "url": "61cab625c95584a12a30049d054931d64f8d20aa", }), + ("https://archive.palanq.win/c/thread/4209598/", { + "url": "1f9b5570d228f1f2991c827a6631030bc0e5933c", + }), ("https://rbt.asia/g/thread/61487650/", { "url": "fadd274b25150a1bdf03a40c58db320fa3b617c4", }), @@ -173,6 +180,7 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/"), ("https://desuarchive.org/a/"), ("https://boards.fireden.net/sci/"), + ("https://archive.palanq.win/c/"), ("https://rbt.asia/g/"), ("https://thebarchive.com/b/"), ) @@ -215,6 +223,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): ("https://archiveofsins.com/_/search/text/test/"), ("https://desuarchive.org/_/search/text/test/"), ("https://boards.fireden.net/_/search/text/test/"), + ("https://archive.palanq.win/_/search/text/test/"), ("https://rbt.asia/_/search/text/test/"), ("https://thebarchive.com/_/search/text/test/"), ) @@ -279,6 +288,7 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/gallery/"), ("https://desuarchive.org/a/gallery/5"), ("https://boards.fireden.net/sci/gallery/6"), + ("https://archive.palanq.win/c/gallery"), ("https://rbt.asia/g/gallery/8"), ("https://thebarchive.com/b/gallery/9"), ) From 0c46758a934ca084e0ff0a1d4bff7906e0a369ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 May 2023 20:09:04 +0200 Subject: [PATCH 052/252] [foolslide] remove 'sensescans.com' group moved to mangadex https://mangadex.org/group/1071e71d-cc55-4fa6-81d1-4b5913a2fde5/sense-scans --- docs/supportedsites.md | 6 ------ gallery_dl/extractor/foolslide.py | 13 ------------- 2 files changed, 19 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 355b6369ab..811dcc9e77 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1443,12 +1443,6 @@ Consider all sites to be NSFW unless otherwise known. 
Chapters, Manga - - Sense-Scans - https://sensescans.com/reader/ - Chapters, Manga - - Mastodon Instances diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 4a38fb4fcc..57d37b7688 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -42,11 +42,6 @@ def parse_chapter_url(url, data): "root": "https://read.powermanga.org", "pattern": r"read(?:er)?\.powermanga\.org", }, - "sensescans": { - "root": "https://sensescans.com/reader", - "pattern": r"(?:(?:www\.)?sensescans\.com/reader" - r"|reader\.sensescans\.com)", - }, }) @@ -64,11 +59,6 @@ class FoolslideChapterExtractor(FoolslideExtractor): "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384", "keyword": "a60c42f2634b7387899299d411ff494ed0ad6dbe", }), - ("https://sensescans.com/reader/read/ao_no_orchestra/en/0/26/", { - "url": "bbd428dc578f5055e9f86ad635b510386cd317cd", - "keyword": "083ef6f8831c84127fe4096fa340a249be9d1424", - }), - ("https://reader.sensescans.com/read/ao_no_orchestra/en/0/26/"), ) def items(self): @@ -129,9 +119,6 @@ class FoolslideMangaExtractor(FoolslideExtractor): "volume": int, }, }), - ("https://sensescans.com/reader/series/yotsubato/", { - "count": ">= 3", - }), ) def items(self): From 076380e079e9764b79c6add737a281f39d0f7cc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 May 2023 22:16:58 +0200 Subject: [PATCH 053/252] remove '*' indicating keyword-only arguments they are kind of unnecessary and cause a non-insignificant function call overhead (~10%) --- gallery_dl/config.py | 14 +++++++------- gallery_dl/cookies.py | 7 +++---- gallery_dl/extractor/common.py | 8 ++++---- gallery_dl/extractor/oauth.py | 2 +- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/gallery_dl/config.py b/gallery_dl/config.py index 29d0155fc7..0b2aca8066 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -119,7 +119,7 @@ def clear(): _config.clear() -def get(path, key, default=None, *, conf=_config): +def get(path, key, default=None, conf=_config): """Get the value of property 'key' or a default value""" try: for p in path: @@ -129,7 +129,7 @@ def get(path, key, default=None, *, conf=_config): return default -def interpolate(path, key, default=None, *, conf=_config): +def interpolate(path, key, default=None, conf=_config): """Interpolate the value of 'key'""" if key in conf: return conf[key] @@ -143,7 +143,7 @@ def interpolate(path, key, default=None, *, conf=_config): return default -def interpolate_common(common, paths, key, default=None, *, conf=_config): +def interpolate_common(common, paths, key, default=None, conf=_config): """Interpolate the value of 'key' using multiple 'paths' along a 'common' ancestor """ @@ -175,7 +175,7 @@ def interpolate_common(common, paths, key, default=None, *, conf=_config): return default -def accumulate(path, key, *, conf=_config): +def accumulate(path, key, conf=_config): """Accumulate the values of 'key' along 'path'""" result = [] try: @@ -194,7 +194,7 @@ def accumulate(path, key, *, conf=_config): return result -def set(path, key, value, *, conf=_config): +def set(path, key, value, conf=_config): """Set the value of property 'key' for this session""" for p in path: try: @@ -204,7 +204,7 @@ def set(path, key, value, *, conf=_config): conf[key] = value -def setdefault(path, key, value, *, conf=_config): +def setdefault(path, key, value, conf=_config): """Set the value of property 'key' if it doesn't exist""" for p in path: try: @@ -214,7 +214,7 @@ def setdefault(path, key, value, 
*, conf=_config): return conf.setdefault(key, value) -def unset(path, key, *, conf=_config): +def unset(path, key, conf=_config): """Unset the value of property 'key'""" try: for p in path: diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 3d715a7704..78e73bfea2 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -412,18 +412,17 @@ def cookie_counts(self): raise NotImplementedError("Must be implemented by sub classes") -def get_cookie_decryptor(browser_root, browser_keyring_name, *, keyring=None): +def get_cookie_decryptor(browser_root, browser_keyring_name, keyring=None): if sys.platform in ("win32", "cygwin"): return WindowsChromeCookieDecryptor(browser_root) elif sys.platform == "darwin": return MacChromeCookieDecryptor(browser_keyring_name) else: - return LinuxChromeCookieDecryptor( - browser_keyring_name, keyring=keyring) + return LinuxChromeCookieDecryptor(browser_keyring_name, keyring) class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): - def __init__(self, browser_keyring_name, *, keyring=None): + def __init__(self, browser_keyring_name, keyring=None): self._v10_key = self.derive_key(b"peanuts") password = _get_linux_keyring_password(browser_keyring_name, keyring) self._v11_key = None if password is None else self.derive_key(password) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 8024be9fa3..8e3a9a9b77 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -106,7 +106,7 @@ def _config_shared_accumulate(self, key): values[:0] = config.accumulate((self.subcategory,), key, conf=conf) return values - def request(self, url, *, method="GET", session=None, + def request(self, url, method="GET", session=None, retries=None, retry_codes=None, encoding=None, fatal=True, notfound=None, **kwargs): if session is None: @@ -180,7 +180,7 @@ def request(self, url, *, method="GET", session=None, raise exception.HttpError(msg, response) - def wait(self, *, seconds=None, until=None, adjust=1.0, + def wait(self, seconds=None, until=None, adjust=1.0, reason="rate limit reset"): now = time.time() @@ -371,7 +371,7 @@ def _store_cookies(self): except OSError as exc: self.log.warning("cookies: %s", exc) - def _update_cookies(self, cookies, *, domain=""): + def _update_cookies(self, cookies, domain=""): """Update the session's cookiejar with 'cookies'""" if isinstance(cookies, dict): self._update_cookies_dict(cookies, domain or self.cookiedomain) @@ -391,7 +391,7 @@ def _update_cookies_dict(self, cookiedict, domain): for name, value in cookiedict.items(): setcookie(name, value, domain=domain) - def _check_cookies(self, cookienames, *, domain=None): + def _check_cookies(self, cookienames, domain=None): """Check if all 'cookienames' are in the session's cookiejar""" if not self._cookiejar: return False diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 404f296d24..824757ce9b 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -134,7 +134,7 @@ def _oauth1_authorization_flow( def _oauth2_authorization_code_grant( self, client_id, client_secret, default_id, default_secret, - auth_url, token_url, *, scope="read", duration="permanent", + auth_url, token_url, scope="read", duration="permanent", key="refresh_token", auth=True, cache=None, instance=None): """Perform an OAuth2 authorization code grant"""
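A short, self-contained illustration (not part of the patch above) of what the removed '*' does and of how the quoted ~10% call overhead can be measured; the exact figure depends on Python version and hardware:

    import timeit

    def get_kwonly(path, key, default=None, *, conf=None):
        # '*' makes 'conf' keyword-only:
        # get_kwonly((), "k", None, {}) raises TypeError
        return default

    def get_plain(path, key, default=None, conf=None):
        # without '*', 'conf' may also be passed positionally
        return default

    n = 1000000
    t_kwonly = timeit.timeit(lambda: get_kwonly((), "key"), number=n)
    t_plain = timeit.timeit(lambda: get_plain((), "key"), number=n)
    print("keyword-only: {:.3f}s / plain: {:.3f}s".format(t_kwonly, t_plain))
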
From 7499fa70754cfa4cbe9d4df8b039a4a322ccb985 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 May 2023 17:39:49 +0200 Subject: [PATCH 054/252] [exhentai] remove and update sad panda check there hasn't been a sad panda in several years --- gallery_dl/extractor/exhentai.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index dccc74e401..9cd7ae4e67 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2022 Mike Fährmann +# Copyright 2014-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. @@ -21,8 +21,7 @@ class ExhentaiExtractor(Extractor): """Base class for exhentai extractors""" category = "exhentai" directory_fmt = ("{category}", "{gid} {title[:247]}") - filename_fmt = ( - "{gid}_{num:>04}_{image_token}_{filename}.{extension}") + filename_fmt = "{gid}_{num:>04}_{image_token}_{filename}.{extension}" archive_fmt = "{gid}_{num}" cookienames = ("ipb_member_id", "ipb_pass_hash") cookiedomain = ".exhentai.org" @@ -56,10 +55,10 @@ def __init__(self, match): if version != "ex": self.session.cookies.set("nw", "1", domain=self.cookiedomain) - def request(self, *args, **kwargs): - response = Extractor.request(self, *args, **kwargs) - if self._is_sadpanda(response): - self.log.info("sadpanda.jpg") + def request(self, url, **kwargs): + response = Extractor.request(self, url, **kwargs) + if response.history and response.headers.get("Content-Length") == "0": + self.log.info("blank page") raise exception.AuthorizationError() return response @@ -100,14 +99,6 @@ def _login_impl(self, username, password): raise exception.AuthenticationError() return {c: response.cookies[c] for c in self.cookienames} - @staticmethod - def _is_sadpanda(response): - """Return True if the response object contains a sad panda""" - return ( - response.headers.get("Content-Length") == "9615" and - "sadpanda.jpg" in response.headers.get("Content-Disposition", "") - ) - class ExhentaiGalleryExtractor(ExhentaiExtractor): """Extractor for image galleries from exhentai.org""" From 0e74df1de8b5009d8ffe932f00facea1226a6f64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 May 2023 17:46:21 +0200 Subject: [PATCH 055/252] [420chan] remove module offline since 2022-06-01 --- docs/supportedsites.md | 6 --- gallery_dl/extractor/420chan.py | 76 -------------------------------- gallery_dl/extractor/__init__.py | 1 - 3 files changed, 83 deletions(-) delete mode 100644 gallery_dl/extractor/420chan.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 811dcc9e77..89b01045b5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -31,12 +31,6 @@ Consider all sites to be NSFW unless otherwise known. Pools, Popular Images, Posts, Tag Searches - - 420chan - https://420chan.org/ - Boards, Threads - - 4chan https://www.4chan.org/ diff --git a/gallery_dl/extractor/420chan.py b/gallery_dl/extractor/420chan.py deleted file mode 100644 index fd0172e082..0000000000 --- a/gallery_dl/extractor/420chan.py +++ /dev/null @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2021 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation.
- -"""Extractors for https://420chan.org/""" - -from .common import Extractor, Message - - -class _420chanThreadExtractor(Extractor): - """Extractor for 420chan threads""" - category = "420chan" - subcategory = "thread" - directory_fmt = ("{category}", "{board}", "{thread} {title}") - archive_fmt = "{board}_{thread}_{filename}" - pattern = r"(?:https?://)?boards\.420chan\.org/([^/?#]+)/thread/(\d+)" - test = ("https://boards.420chan.org/ani/thread/33251/chow-chows", { - "pattern": r"https://boards\.420chan\.org/ani/src/\d+\.jpg", - "content": "b07c803b0da78de159709da923e54e883c100934", - "count": 2, - }) - - def __init__(self, match): - Extractor.__init__(self, match) - self.board, self.thread = match.groups() - - def items(self): - url = "https://api.420chan.org/{}/res/{}.json".format( - self.board, self.thread) - posts = self.request(url).json()["posts"] - - data = { - "board" : self.board, - "thread": self.thread, - "title" : posts[0].get("sub") or posts[0]["com"][:50], - } - - yield Message.Directory, data - for post in posts: - if "filename" in post: - post.update(data) - post["extension"] = post["ext"][1:] - url = "https://boards.420chan.org/{}/src/{}{}".format( - post["board"], post["filename"], post["ext"]) - yield Message.Url, url, post - - -class _420chanBoardExtractor(Extractor): - """Extractor for 420chan boards""" - category = "420chan" - subcategory = "board" - pattern = r"(?:https?://)?boards\.420chan\.org/([^/?#]+)/\d*$" - test = ("https://boards.420chan.org/po/", { - "pattern": _420chanThreadExtractor.pattern, - "count": ">= 100", - }) - - def __init__(self, match): - Extractor.__init__(self, match) - self.board = match.group(1) - - def items(self): - url = "https://api.420chan.org/{}/threads.json".format(self.board) - threads = self.request(url).json() - - for page in threads: - for thread in page["threads"]: - url = "https://boards.420chan.org/{}/thread/{}/".format( - self.board, thread["no"]) - thread["page"] = page["page"] - thread["_extractor"] = _420chanThreadExtractor - yield Message.Queue, url, thread diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9841ca7d63..7d1d819dd5 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -14,7 +14,6 @@ "2chen", "35photo", "3dbooru", - "420chan", "4chan", "500px", "8chan", From 79f47f98dd4167c357ba3c7c113f7ff52b0eec1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 May 2023 18:02:58 +0200 Subject: [PATCH 056/252] [nana] remove module permanently gone since 2023-03-13 --- docs/configuration.rst | 11 --- docs/gallery-dl.conf | 4 -- docs/supportedsites.md | 6 -- gallery_dl/extractor/__init__.py | 1 - gallery_dl/extractor/nana.py | 118 ------------------------------- 5 files changed, 140 deletions(-) delete mode 100644 gallery_dl/extractor/nana.py diff --git a/docs/configuration.rst b/docs/configuration.rst index 53c5d69517..651275c531 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2074,17 +2074,6 @@ Description Fetch media from replies to other notes. -extractor.nana.favkey ---------------------- -Type - ``string`` -Default - ``null`` -Description - Your `Nana Favorite Key `__, - used to access your favorite archives. 
- - extractor.newgrounds.flash -------------------------- Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 09d9e80aa8..92451fda16 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -201,10 +201,6 @@ "format": "original", "include": "art" }, - "nana": - { - "favkey": null - }, "nijie": { "username": null, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 89b01045b5..3d1ac014fe 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -541,12 +541,6 @@ Consider all sites to be NSFW unless otherwise known. Galleries - - Nana - https://nana.my.id/ - Galleries, Favorites, Search Results - - Naver https://blog.naver.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 7d1d819dd5..1828583cfc 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -96,7 +96,6 @@ "misskey", "myhentaigallery", "myportfolio", - "nana", "naver", "naverwebtoon", "newgrounds", diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py deleted file mode 100644 index 24e676fed9..0000000000 --- a/gallery_dl/extractor/nana.py +++ /dev/null @@ -1,118 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://nana.my.id/""" - -from .common import GalleryExtractor, Extractor, Message -from .. import text, util, exception - - -class NanaGalleryExtractor(GalleryExtractor): - """Extractor for image galleries from nana.my.id""" - category = "nana" - directory_fmt = ("{category}", "{title}") - pattern = r"(?:https?://)?nana\.my\.id/reader/([^/?#]+)" - test = ( - (("https://nana.my.id/reader/" - "059f7de55a4297413bfbd432ce7d6e724dd42bae"), { - "pattern": r"https://nana\.my\.id/reader/" - r"\w+/image/page\?path=.*\.\w+", - "keyword": { - "title" : "Everybody Loves Shion", - "artist": "fuzui", - "tags" : list, - "count" : 29, - }, - }), - (("https://nana.my.id/reader/" - "77c8712b67013e427923573379f5bafcc0c72e46"), { - "pattern": r"https://nana\.my\.id/reader/" - r"\w+/image/page\?path=.*\.\w+", - "keyword": { - "title" : "Lovey-Dovey With an Otaku-Friendly Gyaru", - "artist": "Sueyuu", - "tags" : ["Sueyuu"], - "count" : 58, - }, - }), - ) - - def __init__(self, match): - self.gallery_id = match.group(1) - url = "https://nana.my.id/reader/" + self.gallery_id - GalleryExtractor.__init__(self, match, url) - - def metadata(self, page): - title = text.unescape( - text.extr(page, '  ', '')) - artist = text.unescape(text.extr( - page, '', ''))[len(title):-10] - tags = text.extr(page, 'Reader.tags = "', '"') - - return { - "gallery_id": self.gallery_id, - "title" : title, - "artist" : artist[4:] if artist.startswith(" by ") else "", - "tags" : tags.split(", ") if tags else (), - "lang" : "en", - "language" : "English", - } - - def images(self, page): - data = util.json_loads(text.extr(page, "Reader.pages = ", ".pages")) - return [ - ("https://nana.my.id" + image, None) - for image in data["pages"] - ] - - -class NanaSearchExtractor(Extractor): - """Extractor for nana search results""" - category = "nana" - subcategory = "search" - pattern = r"(?:https?://)?nana\.my\.id(?:/?\?([^#]+))" - test = ( - ('https://nana.my.id/?q=+"elf"&sort=desc', { - "pattern": NanaGalleryExtractor.pattern, - "range": "1-100", - "count": 100, - }), - ("https://nana.my.id/?q=favorites%3A", { - "pattern": NanaGalleryExtractor.pattern, - 
"count": ">= 2", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.params = text.parse_query(match.group(1)) - self.params["p"] = text.parse_int(self.params.get("p"), 1) - self.params["q"] = self.params.get("q") or "" - - def items(self): - if "favorites:" in self.params["q"]: - favkey = self.config("favkey") - if not favkey: - raise exception.AuthenticationError( - "'Favorite key' not provided. " - "Please see 'https://nana.my.id/tutorial'") - self.session.cookies.set("favkey", favkey, domain="nana.my.id") - - data = {"_extractor": NanaGalleryExtractor} - while True: - try: - page = self.request( - "https://nana.my.id", params=self.params).text - except exception.HttpError: - return - - for gallery in text.extract_iter( - page, '
<div class="id3">', '</div>
    '): - url = "https://nana.my.id" + text.extr( - gallery, ' Date: Wed, 3 May 2023 18:41:09 +0200 Subject: [PATCH 057/252] [pinterest] update endpoint for related board pins --- gallery_dl/extractor/pinterest.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 31ddbcc892..96523ed35b 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -121,7 +121,7 @@ class PinterestPinExtractor(PinterestExtractor): }), # video pin (#1189) ("https://www.pinterest.com/pin/422564377542934214/", { - "pattern": r"https://v\.pinimg\.com/videos/mc/hls/d7/22/ff" + "pattern": r"https://v\d*\.pinimg\.com/videos/mc/hls/d7/22/ff" r"/d722ff00ab2352981b89974b37909de8.m3u8", }), ("https://www.pinterest.com/pin/858146903966145188/", { @@ -248,7 +248,7 @@ class PinterestCreatedExtractor(PinterestExtractor): pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/_created/?$" test = ("https://www.pinterest.de/digitalmomblog/_created/", { "pattern": r"https://i\.pinimg\.com/originals/[0-9a-f]{2}" - r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.jpg", + r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.(jpg|png)", "count": 10, "range": "1-10", }) @@ -348,7 +348,7 @@ class PinterestRelatedBoardExtractor(PinterestBoardExtractor): }) def pins(self): - return self.api.board_related(self.board["id"]) + return self.api.board_content_recommendation(self.board["id"]) class PinterestPinitExtractor(PinterestExtractor): @@ -458,10 +458,10 @@ def board_section_pins(self, section_id): options = {"section_id": section_id} return self._pagination("BoardSectionPins", options) - def board_related(self, board_id): + def board_content_recommendation(self, board_id): """Yield related pins of a specific board""" - options = {"board_id": board_id, "add_vase": True} - return self._pagination("BoardRelatedPixieFeed", options) + options = {"id": board_id, "type": "board", "add_vase": True} + return self._pagination("BoardContentRecommendation", options) def user_pins(self, user): """Yield all pins from 'user'""" From 4d415376d17bc1739a5ded5db7f7ad6dae51abc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 May 2023 18:53:56 +0200 Subject: [PATCH 058/252] [pinterest] fix 'pin.it' extractor it really was just the single '/' at the end of the url_shortener URL --- gallery_dl/extractor/pinterest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 96523ed35b..24c0a06321 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -370,7 +370,7 @@ def __init__(self, match): self.shortened_id = match.group(1) def items(self): - url = "https://api.pinterest.com/url_shortener/{}/redirect".format( + url = "https://api.pinterest.com/url_shortener/{}/redirect/".format( self.shortened_id) response = self.request(url, method="HEAD", allow_redirects=False) location = response.headers.get("Location") From 850df34c316568daa8353f9410d5471fbad2f741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 May 2023 20:26:25 +0200 Subject: [PATCH 059/252] remove '&' from URL patterns part 2 follow-up on 968d3e8465d70bf589b87ff79182ee9cae3ce4fb --- gallery_dl/extractor/myportfolio.py | 6 +++--- gallery_dl/extractor/pinterest.py | 20 ++++++++++---------- gallery_dl/extractor/sankakucomplex.py | 4 ++-- gallery_dl/extractor/urlshortener.py | 2 +- 4 files changed, 16 insertions(+), 16 
deletions(-) diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index 7d23518b72..fd16f247a9 100644 --- a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.myportfolio.com/""" +"""Extractors for https://www.myportfolio.com/""" from .common import Extractor, Message from .. import text, exception @@ -21,7 +21,7 @@ class MyportfolioGalleryExtractor(Extractor): archive_fmt = "{user}_{filename}" pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|" r"(?:https?://)?([\w-]+\.myportfolio\.com))" - r"(/[^/?&#]+)?") + r"(/[^/?#]+)?") test = ( ("https://andrewling.myportfolio.com/volvo-xc-90-hybrid", { "url": "acea0690c76db0e5cf267648cefd86e921bc3499", diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 24c0a06321..92e0588657 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -112,7 +112,7 @@ def _media_from_pin(pin): class PinterestPinExtractor(PinterestExtractor): """Extractor for images from a single pin from pinterest.com""" subcategory = "pin" - pattern = BASE_PATTERN + r"/pin/([^/?#&]+)(?!.*#related$)" + pattern = BASE_PATTERN + r"/pin/([^/?#]+)(?!.*#related$)" test = ( ("https://www.pinterest.com/pin/858146903966145189/", { "url": "afb3c26719e3a530bb0e871c480882a801a4e8a5", @@ -147,8 +147,8 @@ class PinterestBoardExtractor(PinterestExtractor): subcategory = "board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") archive_fmt = "{board[id]}_{id}" - pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#&]+)" - "/(?!_saved|_created|pins/)([^/?#&]+)/?$") + pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)" + "/(?!_saved|_created|pins/)([^/?#]+)/?$") test = ( ("https://www.pinterest.com/g1952849/test-/", { "pattern": r"https://i\.pinimg\.com/originals/", @@ -198,7 +198,7 @@ def pins(self): class PinterestUserExtractor(PinterestExtractor): """Extractor for a user's boards""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)(?:/_saved)?/?$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)(?:/_saved)?/?$" test = ( ("https://www.pinterest.com/g1952849/", { "pattern": PinterestBoardExtractor.pattern, @@ -223,7 +223,7 @@ class PinterestAllpinsExtractor(PinterestExtractor): """Extractor for a user's 'All Pins' feed""" subcategory = "allpins" directory_fmt = ("{category}", "{user}") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/pins/?$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/pins/?$" test = ("https://www.pinterest.com/g1952849/pins/", { "pattern": r"https://i\.pinimg\.com/originals/[0-9a-f]{2}" r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w{3}", @@ -245,7 +245,7 @@ class PinterestCreatedExtractor(PinterestExtractor): """Extractor for a user's created pins""" subcategory = "created" directory_fmt = ("{category}", "{user}") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/_created/?$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/_created/?$" test = ("https://www.pinterest.de/digitalmomblog/_created/", { "pattern": r"https://i\.pinimg\.com/originals/[0-9a-f]{2}" r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.(jpg|png)", @@ -270,7 +270,7 @@ class PinterestSectionExtractor(PinterestExtractor): 
directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}", "{section[title]}") archive_fmt = "{board[id]}_{id}" - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)/([^/?#&]+)" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/([^/?#]+)/([^/?#]+)" test = ("https://www.pinterest.com/g1952849/stuff/section", { "count": 2, }) @@ -321,7 +321,7 @@ class PinterestRelatedPinExtractor(PinterestPinExtractor): """Extractor for related pins of another pin from pinterest.com""" subcategory = "related-pin" directory_fmt = ("{category}", "related {original_pin[id]}") - pattern = BASE_PATTERN + r"/pin/([^/?#&]+).*#related$" + pattern = BASE_PATTERN + r"/pin/([^/?#]+).*#related$" test = ("https://www.pinterest.com/pin/858146903966145189/#related", { "range": "31-70", "count": 40, @@ -340,7 +340,7 @@ class PinterestRelatedBoardExtractor(PinterestBoardExtractor): subcategory = "related-board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}", "related") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)/?#related$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/([^/?#]+)/?#related$" test = ("https://www.pinterest.com/g1952849/test-/#related", { "range": "31-70", "count": 40, @@ -354,7 +354,7 @@ def pins(self): class PinterestPinitExtractor(PinterestExtractor): """Extractor for images from a pin.it URL""" subcategory = "pinit" - pattern = r"(?:https?://)?pin\.it/([^/?#&]+)" + pattern = r"(?:https?://)?pin\.it/([^/?#]+)" test = ( ("https://pin.it/Hvt8hgT", { diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py index 5d83299357..47e067b208 100644 --- a/gallery_dl/extractor/sankakucomplex.py +++ b/gallery_dl/extractor/sankakucomplex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -112,7 +112,7 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor): """Extractor for sankakucomplex blog articles by tag or author""" subcategory = "tag" pattern = (r"(?:https?://)?www\.sankakucomplex\.com" - r"/((?:tag|category|author)/[^/&?#]+)") + r"/((?:tag|category|author)/[^/?#]+)") test = ( ("https://www.sankakucomplex.com/tag/cosplay/", { "range": "1-50", diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py index 1a39b5bea9..972b508db1 100644 --- a/gallery_dl/extractor/urlshortener.py +++ b/gallery_dl/extractor/urlshortener.py @@ -34,7 +34,7 @@ class UrlshortenerExtractor(BaseExtractor): class UrlshortenerLinkExtractor(UrlshortenerExtractor): """Extractor for general-purpose URL shorteners""" subcategory = "link" - pattern = BASE_PATTERN + r"/([^/?&#]+)" + pattern = BASE_PATTERN + r"/([^/?#]+)" test = ( ("https://bit.ly/3cWIUgq", { "count": 1, From f0b76e0bb5b61ad3f04313c6723363e4866e332f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 4 May 2023 10:46:38 +0200 Subject: [PATCH 060/252] publish pull request helper script it's what I've been using to manage GitHub pull requests locally --- scripts/pull-request | 55 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100755 scripts/pull-request diff --git a/scripts/pull-request b/scripts/pull-request new file mode 100755 index 0000000000..defdc11fb9 --- /dev/null +++ b/scripts/pull-request @@ -0,0 +1,55 @@ +#!/bin/bash +set -e + 
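+
+# Usage sketch (added comment, not part of the original script; the
+# repository URL and PR number below are placeholders):
+#
+#   ./scripts/pull-request https://github.com/USER/gallery-dl/tree/BRANCH fetch
+#   ./scripts/pull-request https://github.com/USER/gallery-dl/tree/BRANCH merge "some fix #1234"
+#
+# 'fetch' (the default action) adds USER as a git remote and checks the
+# branch out locally as 'USER-BRANCH'; 'merge' switches to master and
+# merges it with --no-ff as "merge #1234: some fix"; 'push' updates the
+# contributor's branch; 'delete' removes the local branch and remote.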
+RE="https://github.com/([^/?#]+)/([^/?#]+)(/tree/(.+))?" +if [[ "$1" =~ $RE ]]; then + USER="${BASH_REMATCH[1]}" + REPO="${BASH_REMATCH[2]}" + BRANCH="${BASH_REMATCH[4]:-master}" + +else + echo "invalid github repository identifier: '$1'" + exit 1 + +fi + + +call() { echo "$@"; "$@"; echo; } + +# {x,,} transforms value to lowercase +case "${2,,}" in + +""|"f"|"fetch") + call git remote add "$USER" git@github.com:"$USER"/"$REPO".git || true + call git fetch "$USER" "$BRANCH" + call git checkout -b "$USER-$BRANCH" "$USER/$BRANCH" + ;; + +"m"|"merge") + RE='\s*(.+)\s+#([0-9]+)' + if [[ "$3" =~ $RE ]]; then + TITLE="${BASH_REMATCH[1]}" + PULL="${BASH_REMATCH[2]}" + fi + + call git switch master + call git merge --no-ff --edit -m "merge #${PULL-_}: ${TITLE-_}" "$USER-$BRANCH" + call git branch -d "$USER-$BRANCH" + ;; + +"p"|"push") + call git push "$USER" HEAD:"$BRANCH" + ;; + +"d"|"delete") + call git switch master + call git branch -D "$USER-$BRANCH" + call git remote remove "$USER" + ;; + +*) + echo "invalid action: '$2'" + exit 2 + ;; + +esac From bc6d65d203200c32aec94c5f81778decde0352ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 4 May 2023 10:49:14 +0200 Subject: [PATCH 061/252] implement 'Extractor.config_deprecated()' a version of 'Extractor.config()' that logs a warning when using a deprecated option name --- gallery_dl/extractor/common.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 8e3a9a9b77..78760e0aa2 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -90,6 +90,21 @@ def skip(self, num): def config(self, key, default=None): return config.interpolate(self._cfgpath, key, default) + def config_deprecated(self, key, deprecated, default=None, + sentinel=util.SENTINEL, history=set()): + value = self.config(deprecated, sentinel) + if value is not sentinel: + if deprecated not in history: + history.add(deprecated) + self.log.warning("'%s' is deprecated. Use '%s' instead.", + deprecated, key) + default = value + + value = self.config(key, sentinel) + if value is not sentinel: + return value + return default + def config_accumulate(self, key): return config.accumulate(self._cfgpath, key) From 3ca5dac8b636f954a3517c9507ffa7e61db8542c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 4 May 2023 15:10:47 +0200 Subject: [PATCH 062/252] extend 'cookies-update' functionality Allow writing cookies to a different file than a given cookies.txt, making it possible to export cookies imported with --cookies-from-browser To convert browser cookies to cookies.txt format: gallery-dl --cookies-fr chromium \ -o cookies-update=cookies.txt \ --no-download \ http://example.org/file.jpg --- docs/configuration.rst | 12 ++++++++---- gallery_dl/extractor/common.py | 22 ++++++++++++++++------ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 651275c531..5077a5e534 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -457,13 +457,17 @@ Description extractor.*.cookies-update -------------------------- Type - ``bool`` + * ``bool`` + * |Path|_ Default ``true`` Description - If `extractor.*.cookies`_ specifies the |Path|_ of a cookies.txt - file and it can be opened and parsed without errors, - update its contents with cookies received during data extraction. + Export session cookies in cookies.txt format. 
+ + * If this is a |Path|_, write cookies to the given file path. + + * If this is ``true`` and `extractor.*.cookies`_ specifies the |Path|_ + of a valid cookies.txt file, update its contents. extractor.*.proxy diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 78760e0aa2..09737ef996 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -379,12 +379,22 @@ def _init_cookies(self): def _store_cookies(self): """Store the session's cookiejar in a cookies.txt file""" - if self._cookiefile and self.config("cookies-update", True): - try: - with open(self._cookiefile, "w") as fp: - util.cookiestxt_store(fp, self._cookiejar) - except OSError as exc: - self.log.warning("cookies: %s", exc) + export = self.config("cookies-update", True) + if not export: + return + + if isinstance(export, str): + path = util.expand_path(export) + else: + path = self._cookiefile + if not path: + return + + try: + with open(path, "w") as fp: + util.cookiestxt_store(fp, self._cookiejar) + except OSError as exc: + self.log.warning("cookies: %s", exc) def _update_cookies(self, cookies, domain=""): """Update the session's cookiejar with 'cookies'""" From a14b63d941e6d8c0c0ba587eeb46acdc45a03ba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 May 2023 21:32:18 +0200 Subject: [PATCH 063/252] support selecting a domain for '--cookies-from-browser' for example 'gallery-dl --cookies-from-browser firefox/twitter.com' --- docs/configuration.rst | 3 +- docs/options.md | 9 +++--- gallery_dl/__init__.py | 4 ++- gallery_dl/cookies.py | 65 ++++++++++++++++++++++++++++++------------ gallery_dl/option.py | 7 +++-- 5 files changed, 61 insertions(+), 27 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 5077a5e534..6d5ea7c231 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -446,12 +446,13 @@ Description * The optional second entry is a profile name or an absolute path to a profile directory * The optional third entry is the keyring to retrieve passwords for decrypting cookies from * The optional fourth entry is a (Firefox) container name (``"none"`` for only cookies with no container) + * The optional fifth entry is the domain to extract cookies for. Prefix it with a dot ``.`` to include cookies for subdomains. Has no effect when also specifying a container. .. code:: json ["firefox"] ["firefox", null, null, "Personal"] - ["chromium", "Private", "kwallet"] + ["chromium", "Private", "kwallet", null, ".twitter.com"] extractor.*.cookies-update diff --git a/docs/options.md b/docs/options.md index 2df9788fe0..4df191d5d1 100644 --- a/docs/options.md +++ b/docs/options.md @@ -19,11 +19,12 @@ --clear-cache MODULE Delete cached login sessions, cookies, etc. 
for MODULE (ALL to delete everything) --cookies FILE File to load additional cookies from - --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] + --cookies-from-browser BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER] Name of the browser to load cookies from, with - optional keyring name prefixed with '+', profile - prefixed with ':', and container prefixed with - '::' ('none' for no container) + optional domain prefixed with '/', keyring name + prefixed with '+', profile prefixed with ':', + and container prefixed with '::' ('none' for no + container) ## Output Options: -q, --quiet Activate quiet mode diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index a430f131ad..1450e8f2b4 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -70,12 +70,14 @@ def main(): if args.cookies_from_browser: browser, _, profile = args.cookies_from_browser.partition(":") browser, _, keyring = browser.partition("+") + browser, _, domain = browser.partition("/") if profile.startswith(":"): container = profile[1:] profile = None else: profile, _, container = profile.partition("::") - config.set((), "cookies", (browser, profile, keyring, container)) + config.set((), "cookies", ( + browser, profile, keyring, container, domain)) if args.options_pp: config.set((), "postprocessor-options", args.options_pp) for opts in args.options: diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 78e73bfea2..9e6b3a764e 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -34,19 +34,19 @@ def load_cookies(cookiejar, browser_specification): - browser_name, profile, keyring, container = \ + browser_name, profile, keyring, container, domain = \ _parse_browser_specification(*browser_specification) if browser_name == "firefox": - load_cookies_firefox(cookiejar, profile, container) + load_cookies_firefox(cookiejar, profile, container, domain) elif browser_name == "safari": - load_cookies_safari(cookiejar, profile) + load_cookies_safari(cookiejar, profile, domain) elif browser_name in SUPPORTED_BROWSERS_CHROMIUM: - load_cookies_chrome(cookiejar, browser_name, profile, keyring) + load_cookies_chrome(cookiejar, browser_name, profile, keyring, domain) else: raise ValueError("unknown browser '{}'".format(browser_name)) -def load_cookies_firefox(cookiejar, profile=None, container=None): +def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None): path, container_id = _firefox_cookies_database(profile, container) with DatabaseCopy(path) as db: @@ -60,6 +60,13 @@ def load_cookies_firefox(cookiejar, profile=None, container=None): sql += " WHERE originAttributes LIKE ? OR originAttributes LIKE ?" uid = "%userContextId={}".format(container_id) parameters = (uid, uid + "&%") + elif domain: + if domain[0] == ".": + sql += " WHERE host == ? OR host LIKE ?" + parameters = (domain[1:], "%" + domain) + else: + sql += " WHERE host == ? OR host == ?" + parameters = (domain, "." 
+ domain) set_cookie = cookiejar.set_cookie for name, value, domain, path, secure, expires in db.execute( @@ -69,9 +76,10 @@ def load_cookies_firefox(cookiejar, profile=None, container=None): domain, bool(domain), domain.startswith("."), path, bool(path), secure, expires, False, None, None, {}, )) + logger.info("Extracted %s cookies from Firefox", len(cookiejar)) -def load_cookies_safari(cookiejar, profile=None): +def load_cookies_safari(cookiejar, profile=None, domain=None): """Ref.: https://github.com/libyal/dtformats/blob /main/documentation/Safari%20Cookies.asciidoc - This data appears to be out of date @@ -87,7 +95,8 @@ def load_cookies_safari(cookiejar, profile=None): _safari_parse_cookies_page(p.read_bytes(page_size), cookiejar) -def load_cookies_chrome(cookiejar, browser_name, profile, keyring): +def load_cookies_chrome(cookiejar, browser_name, profile=None, + keyring=None, domain=None): config = _get_chromium_based_browser_settings(browser_name) path = _chrome_cookies_database(profile, config) logger.debug("Extracting cookies from %s", path) @@ -95,19 +104,31 @@ def load_cookies_chrome(cookiejar, browser_name, profile, keyring): with DatabaseCopy(path) as db: db.text_factory = bytes decryptor = get_cookie_decryptor( - config["directory"], config["keyring"], keyring=keyring) + config["directory"], config["keyring"], keyring) + + if domain: + if domain[0] == ".": + condition = " WHERE host_key == ? OR host_key LIKE ?" + parameters = (domain[1:], "%" + domain) + else: + condition = " WHERE host_key == ? OR host_key == ?" + parameters = (domain, "." + domain) + else: + condition = "" + parameters = () try: rows = db.execute( "SELECT host_key, name, value, encrypted_value, path, " - "expires_utc, is_secure FROM cookies") + "expires_utc, is_secure FROM cookies" + condition, parameters) except sqlite3.OperationalError: rows = db.execute( "SELECT host_key, name, value, encrypted_value, path, " - "expires_utc, secure FROM cookies") + "expires_utc, secure FROM cookies" + condition, parameters) set_cookie = cookiejar.set_cookie - failed_cookies = unencrypted_cookies = 0 + failed_cookies = 0 + unencrypted_cookies = 0 for domain, name, value, enc_value, path, expires, secure in rows: @@ -136,8 +157,8 @@ def load_cookies_chrome(cookiejar, browser_name, profile, keyring): failed_message = "" logger.info("Extracted %s cookies from %s%s", - len(cookiejar), browser_name, failed_message) - counts = decryptor.cookie_counts.copy() + len(cookiejar), browser_name.capitalize(), failed_message) + counts = decryptor.cookie_counts counts["unencrypted"] = unencrypted_cookies logger.debug("cookie version breakdown: %s", counts) @@ -224,7 +245,7 @@ def _safari_parse_cookies_header(data): return page_sizes, p.cursor -def _safari_parse_cookies_page(data, jar): +def _safari_parse_cookies_page(data, cookiejar, domain=None): p = DataParser(data) p.expect_bytes(b"\x00\x00\x01\x00", "page signature") number_of_cookies = p.read_uint() @@ -238,12 +259,12 @@ def _safari_parse_cookies_page(data, jar): for i, record_offset in enumerate(record_offsets): p.skip_to(record_offset, "space between records") record_length = _safari_parse_cookies_record( - data[record_offset:], jar) + data[record_offset:], cookiejar, domain) p.read_bytes(record_length) p.skip_to_end("space in between pages") -def _safari_parse_cookies_record(data, cookiejar): +def _safari_parse_cookies_record(data, cookiejar, host=None): p = DataParser(data) record_size = p.read_uint() p.skip(4, "unknown record field 1")
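The host-matching rule added across the Firefox, Chrome, and Safari code paths in this patch — exact host, or a dot-prefixed domain that also accepts subdomains — can be restated in a few lines of plain Python. This is an illustrative sketch of the intended semantics, not code from the patch:

    def domain_matches(cookie_host, requested):
        # ".example.org" matches example.org and any of its subdomains;
        # "example.org" matches only that host, stored with or without
        # a leading dot
        if requested.startswith("."):
            return (cookie_host == requested[1:]
                    or cookie_host.endswith(requested))
        return cookie_host == requested or cookie_host == "." + requested

    assert domain_matches("twitter.com", ".twitter.com")
    assert domain_matches("api.twitter.com", ".twitter.com")
    assert domain_matches(".twitter.com", "twitter.com")
    assert not domain_matches("api.twitter.com", "twitter.com")
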
@@ -262,6 +283,14 @@ def _safari_parse_cookies_record(data, cookiejar): p.skip_to(domain_offset) domain = p.read_cstring() + if host: + if host[0] == ".": + if host[1:] != domain and not domain.endswith(host): + return record_size + else: + if host != domain and ("." + host) != domain: + return record_size + p.skip_to(name_offset) name = p.read_cstring() @@ -978,7 +1007,7 @@ def _is_path(value): def _parse_browser_specification( - browser, profile=None, keyring=None, container=None): + browser, profile=None, keyring=None, container=None, domain=None): browser = browser.lower() if browser not in SUPPORTED_BROWSERS: raise ValueError("unsupported browser '{}'".format(browser)) @@ -986,4 +1015,4 @@ def _parse_browser_specification( raise ValueError("unsupported keyring '{}'".format(keyring)) if profile and _is_path(profile): profile = os.path.expanduser(profile) - return browser, profile, keyring, container + return browser, profile, keyring, container, domain diff --git a/gallery_dl/option.py b/gallery_dl/option.py index aad307f351..6bd6d42979 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -156,9 +156,10 @@ def build_parser(): general.add_argument( "--cookies-from-browser", dest="cookies_from_browser", - metavar="BROWSER[+KEYRING][:PROFILE][::CONTAINER]", - help=("Name of the browser to load cookies from, " - "with optional keyring name prefixed with '+', " + metavar="BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER]", + help=("Name of the browser to load cookies from, with optional " + "domain prefixed with '/', " + "keyring name prefixed with '+', " "profile prefixed with ':', and " "container prefixed with '::' ('none' for no container)"), ) From 4c1f3b21602ad3b0b7264b30f49cdb4c20dce26f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 May 2023 22:24:41 +0200 Subject: [PATCH 064/252] [cookies] simplify '_mac_absolute_time_to_posix()' hardcode UNIX timestamp of 2001-01-01 --- gallery_dl/cookies.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 9e6b3a764e..32ba323de9 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -20,7 +20,6 @@ import subprocess import sys import tempfile -from datetime import datetime, timedelta, timezone from hashlib import pbkdf2_hmac from http.cookiejar import Cookie from . import aes, text, util @@ -921,8 +920,8 @@ def _get_linux_desktop_environment(env): def _mac_absolute_time_to_posix(timestamp): - return int((datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc) + - timedelta(seconds=timestamp)).timestamp()) + # 978307200 is timestamp of 2001-01-01 00:00:00 + return 978307200 + int(timestamp) def pbkdf2_sha1(password, salt, iterations, key_length):
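The hardcoded 978307200 above is straightforward to verify: Safari stores cookie timestamps as seconds since 2001-01-01 00:00:00 UTC ("Mac absolute time"), while POSIX time counts from 1970-01-01 UTC. A quick standalone check, not part of the patch:

    from datetime import datetime, timezone

    epoch_2001 = datetime(2001, 1, 1, tzinfo=timezone.utc).timestamp()
    assert int(epoch_2001) == 978307200

    def mac_absolute_time_to_posix(timestamp):
        # convert seconds-since-2001 to seconds-since-1970
        return 978307200 + int(timestamp)

    assert mac_absolute_time_to_posix(0) == 978307200
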
From 8520de57f06bcc3e31a8c9ad10364d7fac5a29d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 May 2023 14:52:43 +0200 Subject: [PATCH 065/252] [imgur] add 'favorite-folder' extractor (#4016) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/imgur.py | 59 ++++++++++++++++++++++++++++----- scripts/supportedsites.py | 3 ++ 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3d1ac014fe..95d799d0b6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -388,7 +388,7 @@ Consider all sites to be NSFW unless otherwise known. imgur https://imgur.com/ - Albums, Favorites, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles + Albums, Favorites, Favorites Folders, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index f8f16006fe..8e22783063 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -285,7 +285,7 @@ def items(self): class ImgurFavoriteExtractor(ImgurExtractor): """Extractor for a user's favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites" + pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/?$" test = ("https://imgur.com/user/Miguenzo/favorites", { "range": "1-100", "count": 100, @@ -296,6 +296,28 @@ def items(self): return self._items_queue(self.api.account_favorites(self.key)) +class ImgurFavoriteFolderExtractor(ImgurExtractor): + """Extractor for a user's favorites folder""" + subcategory = "favorite-folder" + pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/folder/(\d+)" + test = ( + ("https://imgur.com/user/mikf1/favorites/folder/11896757/public", { + "count": 3, + }), + ("https://imgur.com/user/mikf1/favorites/folder/11896741/private", { + "count": 5, + }), + ) + + def __init__(self, match): + ImgurExtractor.__init__(self, match) + self.folder_id = match.group(2) + + def items(self): + return self._items_queue(self.api.account_favorites_folder( + self.key, self.folder_id)) + + class ImgurSubredditExtractor(ImgurExtractor): """Extractor for a subreddits's imgur links""" subcategory = "subreddit" @@ -346,15 +368,18 @@ class ImgurAPI(): """ def __init__(self, extractor): self.extractor = extractor - self.headers = { - "Authorization": "Client-ID " + ( - extractor.config("client-id") or "546c25a59c58ad7"), - } + self.client_id = extractor.config("client-id") or "546c25a59c58ad7" + self.headers = {"Authorization": "Client-ID " + self.client_id} def account_favorites(self, account): endpoint = "/3/account/{}/gallery_favorites".format(account) return self._pagination(endpoint) + def account_favorites_folder(self, account, folder_id): + endpoint = "/3/account/{}/folders/{}/favorites".format( + account, folder_id) + return self._pagination_v2(endpoint) + def gallery_search(self, query): endpoint = "/3/gallery/search" params = {"q": query} @@ -386,12 +411,12 @@ def gallery(self, gallery_hash): endpoint = "/post/v1/posts/" + gallery_hash return self._call(endpoint) - def _call(self, endpoint, params=None): + def _call(self, endpoint, params=None, headers=None): while True: try: return self.extractor.request( "https://api.imgur.com" + endpoint, - params=params, headers=self.headers, + params=params, headers=(headers or self.headers), ).json() except exception.HttpError as exc: if exc.status not in (403, 429) or \ @@ -410,3 +435,23 @@ def _pagination(self, endpoint, params=None, key=None): return yield from data num += 1 + + def _pagination_v2(self, endpoint, params=None, key=None): + if params is None: + params = {} + params["client_id"] = self.client_id + params["page"] = 0 + params["sort"] = "newest" + + headers = { + "Referer": "https://imgur.com/", + "Origin": "https://imgur.com", + } + + while True: + data = self._call(endpoint, params, headers)["data"] + if not data: + return + yield from data + + params["page"] += 1 diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index ebeac1c21a..b25edea6ee 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ 
-183,6 +183,9 @@ "hentaifoundry": { "story": "", }, + "imgur": { + "favorite-folder": "Favorites Folders", + }, "instagram": { "posts": "", "saved": "Saved Posts", From d12dd3813c51cf141bd8b1afad5a98c37379e37c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 May 2023 14:58:42 +0200 Subject: [PATCH 066/252] [imgur] fix internal image/album URLs URLs from "link" attributes of newer images/albums were all returned as 'https://imgur.com/gallery/...' instead of the expected format, causing them to be ignored. --- gallery_dl/extractor/imgur.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 8e22783063..4c29d98ff1 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -47,8 +47,13 @@ def _items_queue(self, items): image_ex = ImgurImageExtractor for item in items: - item["_extractor"] = album_ex if item["is_album"] else image_ex - yield Message.Queue, item["link"], item + if item["is_album"]: + url = "https://imgur.com/a/" + item["id"] + item["_extractor"] = album_ex + else: + url = "https://imgur.com/" + item["id"] + item["_extractor"] = image_ex + yield Message.Queue, url, item class ImgurImageExtractor(ImgurExtractor): @@ -272,7 +277,7 @@ class ImgurUserExtractor(ImgurExtractor): ("https://imgur.com/user/Miguenzo", { "range": "1-100", "count": 100, - "pattern": r"https?://(i.imgur.com|imgur.com/a)/[\w.]+", + "pattern": r"https://imgur\.com(/a)?/\w+$", }), ("https://imgur.com/user/Miguenzo/posts"), ("https://imgur.com/user/Miguenzo/submitted"), @@ -289,7 +294,7 @@ class ImgurFavoriteExtractor(ImgurExtractor): test = ("https://imgur.com/user/Miguenzo/favorites", { "range": "1-100", "count": 100, - "pattern": r"https?://(i.imgur.com|imgur.com/a)/[\w.]+", + "pattern": r"https://imgur\.com(/a)?/\w+$", }) def items(self): @@ -302,9 +307,11 @@ class ImgurFavoriteFolderExtractor(ImgurExtractor): pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/folder/(\d+)" test = ( ("https://imgur.com/user/mikf1/favorites/folder/11896757/public", { + "pattern": r"https://imgur\.com(/a)?/\w+$", "count": 3, }), ("https://imgur.com/user/mikf1/favorites/folder/11896741/private", { + "pattern": r"https://imgur\.com(/a)?/\w+$", "count": 5, }), ) @@ -325,7 +332,7 @@ class ImgurSubredditExtractor(ImgurExtractor): test = ("https://imgur.com/r/pics", { "range": "1-100", "count": 100, - "pattern": r"https?://(i.imgur.com|imgur.com/a)/[\w.]+", + "pattern": r"https://imgur\.com(/a)?/\w+$", }) def items(self): @@ -339,7 +346,7 @@ class ImgurTagExtractor(ImgurExtractor): test = ("https://imgur.com/t/animals", { "range": "1-100", "count": 100, - "pattern": r"https?://(i.imgur.com|imgur.com/a)/[\w.]+", + "pattern": r"https://imgur\.com(/a)?/\w+$", }) def items(self): @@ -353,7 +360,7 @@ class ImgurSearchExtractor(ImgurExtractor): test = ("https://imgur.com/search?q=cute+cat", { "range": "1-100", "count": 100, - "pattern": r"https?://(i.imgur.com|imgur.com/a)/[\w.]+", + "pattern": r"https://imgur\.com(/a)?/\w+$", }) def items(self): From b9b1cdd71bf3f364c55fffe05187c35615816297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 May 2023 17:37:49 +0200 Subject: [PATCH 067/252] add '--cookies-export' command-line option --- docs/options.md | 1 + gallery_dl/option.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/docs/options.md b/docs/options.md index 4df191d5d1..8ab88b423a 100644 --- a/docs/options.md +++ b/docs/options.md @@ -19,6 +19,7 
@@ --clear-cache MODULE Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything) --cookies FILE File to load additional cookies from + --cookies-export FILE Export session cookies to FILE --cookies-from-browser BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER] Name of the browser to load cookies from, with optional domain prefixed with '/', keyring name diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 6bd6d42979..e954d95c5e 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -153,6 +153,11 @@ def build_parser(): dest="cookies", metavar="FILE", action=ConfigAction, help="File to load additional cookies from", ) + general.add_argument( + "--cookies-export", + dest="cookies-update", metavar="FILE", action=ConfigAction, + help="Export session cookies to FILE", + ) general.add_argument( "--cookies-from-browser", dest="cookies_from_browser", From 285391df4381cf14612977c26b60d4a87e1673ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 May 2023 17:40:55 +0200 Subject: [PATCH 068/252] add '-C' as short option for '--cookies' and put cookie options into their own section --- docs/options.md | 18 ++++++++++-------- gallery_dl/option.py | 42 ++++++++++++++++++++++-------------------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/docs/options.md b/docs/options.md index 8ab88b423a..b0abcf8534 100644 --- a/docs/options.md +++ b/docs/options.md @@ -18,14 +18,6 @@ --user-agent UA User-Agent request header --clear-cache MODULE Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything) - --cookies FILE File to load additional cookies from - --cookies-export FILE Export session cookies to FILE - --cookies-from-browser BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER] - Name of the browser to load cookies from, with - optional domain prefixed with '/', keyring name - prefixed with '+', profile prefixed with ':', - and container prefixed with '::' ('none' for no - container) ## Output Options: -q, --quiet Activate quiet mode @@ -86,6 +78,16 @@ -p, --password PASS Password belonging to the given username --netrc Enable .netrc authentication data +## Cookie Options: + -C, --cookies FILE File to load additional cookies from + --cookies-export FILE Export session cookies to FILE + --cookies-from-browser BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER] + Name of the browser to load cookies from, with + optional domain prefixed with '/', keyring name + prefixed with '+', profile prefixed with ':', + and container prefixed with '::' ('none' for no + container) + ## Selection Options: --download-archive FILE Record all downloaded or skipped files in FILE and skip downloading any file already in it diff --git a/gallery_dl/option.py b/gallery_dl/option.py index e954d95c5e..08e6e70105 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -148,26 +148,6 @@ def build_parser(): help="Delete cached login sessions, cookies, etc. 
for MODULE " "(ALL to delete everything)", ) - general.add_argument( - "--cookies", - dest="cookies", metavar="FILE", action=ConfigAction, - help="File to load additional cookies from", - ) - general.add_argument( - "--cookies-export", - dest="cookies-update", metavar="FILE", action=ConfigAction, - help="Export session cookies to FILE", - ) - general.add_argument( - "--cookies-from-browser", - dest="cookies_from_browser", - metavar="BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER]", - help=("Name of the browser to load cookies from, with optional " - "domain prefixed with '/', " - "keyring name prefixed with '+', " - "profile prefixed with ':', and " - "container prefixed with '::' ('none' for no container)"), - ) output = parser.add_argument_group("Output Options") output.add_argument( @@ -380,6 +360,28 @@ def build_parser(): help="Enable .netrc authentication data", ) + cookies = parser.add_argument_group("Cookie Options") + cookies.add_argument( + "-C", "--cookies", + dest="cookies", metavar="FILE", action=ConfigAction, + help="File to load additional cookies from", + ) + cookies.add_argument( + "--cookies-export", + dest="cookies-update", metavar="FILE", action=ConfigAction, + help="Export session cookies to FILE", + ) + cookies.add_argument( + "--cookies-from-browser", + dest="cookies_from_browser", + metavar="BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER]", + help=("Name of the browser to load cookies from, with optional " + "domain prefixed with '/', " + "keyring name prefixed with '+', " + "profile prefixed with ':', and " + "container prefixed with '::' ('none' for no container)"), + ) + selection = parser.add_argument_group("Selection Options") selection.add_argument( "--download-archive", From 1406f7125f2e010b6b5409d4b12a2602af19289f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 May 2023 20:41:43 +0200 Subject: [PATCH 069/252] [4chanarchives] add 'thread' and 'board' extractors (#4012) --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/4chanarchives.py | 136 ++++++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 1 + 3 files changed, 143 insertions(+) create mode 100644 gallery_dl/extractor/4chanarchives.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 95d799d0b6..d951da9051 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -37,6 +37,12 @@ Consider all sites to be NSFW unless otherwise known. Boards, Threads + + 4chanarchives + https://4chanarchives.com/ + Boards, Threads + + 500px https://500px.com/ diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py new file mode 100644 index 0000000000..e5ccd256c6 --- /dev/null +++ b/gallery_dl/extractor/4chanarchives.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://4chanarchives.com/""" + +from .common import Extractor, Message +from .. 
import text + + +class _4chanarchivesThreadExtractor(Extractor): + """Extractor for threads on 4chanarchives.com""" + category = "4chanarchives" + subcategory = "thread" + root = "https://4chanarchives.com" + directory_fmt = ("{category}", "{board}", "{thread} - {title}") + filename_fmt = "{no}-{filename}.{extension}" + archive_fmt = "{board}_{thread}_{no}" + pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)" + test = ( + ("https://4chanarchives.com/board/c/thread/2707110", { + "pattern": r"https://i\.imgur\.com/(0wLGseE|qbByWDc)\.jpg", + "count": 2, + "keyword": { + "board": "c", + "com": str, + "name": "Anonymous", + "no": int, + "thread": "2707110", + "time": r"re:2016-07-1\d \d\d:\d\d:\d\d", + "title": "Ren Kagami from 'Oyako Neburi'", + }, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/board/{}/thread/{}".format( + self.root, self.board, self.thread) + page = self.request(url).text + data = self.metadata(page) + posts = self.posts(page) + + if not data["title"]: + data["title"] = text.unescape(text.remove_html( + posts[0]["com"]))[:50] + + for post in posts: + post.update(data) + yield Message.Directory, post + if "url" in post: + yield Message.Url, post["url"], post + + def metadata(self, page): + return { + "board" : self.board, + "thread" : self.thread, + "title" : text.unescape(text.extr( + page, 'property="og:title" content="', '"')), + } + + def posts(self, page): + """Build a list of all post objects""" + return [self.parse(html) for html in text.extract_iter( + page, 'id="pc', '')] + + def parse(self, html): + """Build post object by extracting data from an HTML post""" + post = self._extract_post(html) + if ">File: <" in html: + self._extract_file(html, post) + post["extension"] = post["url"].rpartition(".")[2] + return post + + @staticmethod + def _extract_post(html): + extr = text.extract_from(html) + return { + "no" : text.parse_int(extr('', '"')), + "name": extr('class="name">', '<'), + "time": extr('class="dateTime postNum" >', '<').rstrip(), + "com" : text.unescape( + html[html.find('")[2]), + } + + @staticmethod + def _extract_file(html, post): + extr = text.extract_from(html, html.index(">File: <")) + post["url"] = extr('href="', '"') + post["filename"] = text.unquote(extr(">", "<").rpartition(".")[0]) + post["fsize"] = extr("(", ", ") + post["w"] = text.parse_int(extr("", "x")) + post["h"] = text.parse_int(extr("", ")")) + + +class _4chanarchivesBoardExtractor(Extractor): + """Extractor for boards on 4chanarchives.com""" + category = "4chanarchives" + subcategory = "board" + root = "https://4chanarchives.com" + pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)(?:/(\d+))?/?$" + test = ( + ("https://4chanarchives.com/board/c/", { + "pattern": _4chanarchivesThreadExtractor.pattern, + "range": "1-40", + "count": 40, + }), + ("https://4chanarchives.com/board/c"), + ("https://4chanarchives.com/board/c/10"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.page = match.groups() + + def items(self): + data = {"_extractor": _4chanarchivesThreadExtractor} + pnum = text.parse_int(self.page, 1) + needle = ''' + Date: Sat, 6 May 2023 20:52:45 +0200 Subject: [PATCH 070/252] [4chanarchives] add end condition for 'board' extractor (#4012) --- gallery_dl/extractor/4chanarchives.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py 
index e5ccd256c6..041e6a3564 100644 --- a/gallery_dl/extractor/4chanarchives.py +++ b/gallery_dl/extractor/4chanarchives.py @@ -130,7 +130,10 @@ def items(self): url = "{}/board/{}/{}".format(self.root, self.board, pnum) page = self.request(url).text + thread = None for thread in text.extract_iter(page, needle, '"'): yield Message.Queue, thread, data + if thread is None: + return pnum += 1 From be0fa94b2e691c5e92d59dd4d1979d3341731490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 7 May 2023 14:36:07 +0200 Subject: [PATCH 071/252] [imagechest] load all images when a 'Load More' button is present (#4028) --- gallery_dl/extractor/imagechest.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index 8b18d5e6d6..086b95d893 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -31,6 +31,12 @@ class ImagechestGalleryExtractor(GalleryExtractor): "content": "076959e65be30249a2c651fbe6090dc30ba85193", "count": 3 }), + # "Load More Files" button (#4028) + ("https://imgchest.com/p/9p4n3q2z7nq", { + "pattern": r"https://cdn\.imgchest\.com/files/\w+\.(jpg|png)", + "url": "f5674e8ba79d336193c9f698708d9dcc10e78cc7", + "count": 52, + }), ) def __init__(self, match): @@ -49,6 +55,18 @@ def metadata(self, page): } def images(self, page): + if " More Files" in page: + url = "{}/p/{}/loadAll".format(self.root, self.gallery_id) + headers = { + "X-Requested-With": "XMLHttpRequest", + "Origin" : self.root, + "Referer" : self.gallery_url, + } + csrf_token = text.extr(page, 'name="csrf-token" content="', '"') + data = {"_token": csrf_token} + page += self.request( + url, method="POST", headers=headers, data=data).text + return [ (url, None) for url in text.extract_iter(page, 'data-url="', '"') From 13dedae09f768ca3be66a5d1119a558fe7964119 Mon Sep 17 00:00:00 2001 From: Janne Alaranta Date: Sun, 7 May 2023 18:35:02 +0300 Subject: [PATCH 072/252] add status and tags info to mangadex extractor --- gallery_dl/extractor/mangadex.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 409483b4f2..0539081125 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -83,8 +83,12 @@ def _transform(self, chapter): data["author"] = [author["attributes"]["name"] for author in relationships["author"]] data["group"] = [group["attributes"]["name"] - for group in relationships["scanlation_group"]] + for group in relationships["scanlation_group"]] + data["status"] = mattributes["status"] + data["tags"] = [tag["attributes"]["name"]["en"] + for tag in mattributes["tags"]] + return data From 1ce5dc9e18cff46077db59b38f6f35f55e91a625 Mon Sep 17 00:00:00 2001 From: Janne Alaranta Date: Sun, 7 May 2023 18:47:04 +0300 Subject: [PATCH 073/252] fix whitespaces --- gallery_dl/extractor/mangadex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 0539081125..6121cf8c78 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -83,12 +83,12 @@ def _transform(self, chapter): data["author"] = [author["attributes"]["name"] for author in relationships["author"]] data["group"] = [group["attributes"]["name"] - for group in relationships["scanlation_group"]] + for group in relationships["scanlation_group"]] data["status"] = mattributes["status"] 
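+        # "status" is the manga's publication state (e.g. "ongoing" or
+        # "completed"); the tag entries below are localized name objects,
+        # so only their English ("en") variant is kept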
data["tags"] = [tag["attributes"]["name"]["en"] for tag in mattributes["tags"]] - + return data From 2266fc8cc5b6bfcfa46b1f9c7150b94a062fa29f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 7 May 2023 20:14:49 +0200 Subject: [PATCH 074/252] [mangadex] update and extend test results --- gallery_dl/extractor/mangadex.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 6121cf8c78..12b8f39235 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -98,13 +98,13 @@ class MangadexChapterExtractor(MangadexExtractor): pattern = BASE_PATTERN + r"/chapter/([0-9a-f-]+)" test = ( ("https://mangadex.org/chapter/f946ac53-0b71-4b5d-aeb2-7931b13c4aaa", { - "keyword": "86fb262cf767dac6d965cd904ad499adba466404", + "keyword": "e86128a79ebe7201b648f1caa828496a2878dc8f", # "content": "50383a4c15124682057b197d40261641a98db514", }), # oneshot ("https://mangadex.org/chapter/61a88817-9c29-4281-bdf1-77b3c1be9831", { "count": 64, - "keyword": "6abcbe1e24eeb1049dc931958853cd767ee483fb", + "keyword": "d11ed057a919854696853362be35fc0ba7dded4c", }), # MANGA Plus (#1154) ("https://mangadex.org/chapter/74149a55-e7c4-44ea-8a37-98e879c1096f", { @@ -161,6 +161,9 @@ class MangadexMangaExtractor(MangadexExtractor): "language": str, "artist" : ["Arakawa Hiromu"], "author" : ["Arakawa Hiromu"], + "status" : "completed", + "tags" : ["Oneshot", "Historical", "Action", + "Martial Arts", "Drama", "Tragedy"], }, }), ("https://mangadex.cc/manga/d0c88e3b-ea64-4e07-9841-c1d2ac982f4a/", { From 306e13a4d48eadcb21b08643835690b6c628c39f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 7 May 2023 20:57:36 +0200 Subject: [PATCH 075/252] release version 1.25.4 --- CHANGELOG.md | 23 +++++++++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a76a0dd391..85c732dd60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,28 @@ # Changelog +## 1.25.4 - 2023-05-07 +### Additions +- [4chanarchives] add `thread` and `board` extractors ([#4012](https://github.com/mikf/gallery-dl/issues/4012)) +- [foolfuuka] add `archive.palanq.win` +- [imgur] add `favorite-folder` extractor ([#4016](https://github.com/mikf/gallery-dl/issues/4016)) +- [mangadex] add `status` and `tags` metadata ([#4031](https://github.com/mikf/gallery-dl/issues/4031)) +- allow selecting a domain with `--cookies-from-browser` +- add `--cookies-export` command-line option +- add `-C` as short option for `--cookies` +- include exception type in config error messages +### Fixes +- [exhentai] update sadpanda check +- [imagechest] load all images when a "Load More" button is present ([#4028](https://github.com/mikf/gallery-dl/issues/4028)) +- [imgur] fix bug causing some images/albums from user profiles and favorites to be ignored +- [pinterest] update endpoint for related board pins +- [pinterest] fix `pin.it` extractor +- [ytdl] fix yt-dlp `--xff/--geo-bypass` tests ([#3989](https://github.com/mikf/gallery-dl/issues/3989)) +### Removals +- [420chan] remove module +- [foolfuuka] remove `archive.alice.al` and `tokyochronos.net` +- [foolslide] remove `sensescans.com` +- [nana] remove module + ## 1.25.3 - 2023-04-30 ### Additions - [imagefap] extract `description` and `categories` metadata ([#3905](https://github.com/mikf/gallery-dl/issues/3905)) diff --git a/README.rst b/README.rst index 
1f4b692680..36f3ffae72 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 39cfbd1c5d..4f9e49a280 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.0-dev" +__version__ = "1.25.4" From 708f478d158d4471a7c2ba3551b4f144a7f9c138 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 11 May 2023 15:59:42 +0200 Subject: [PATCH 076/252] [danbooru][e621] add 'date' metadata field (#4047) --- gallery_dl/extractor/danbooru.py | 5 ++++- gallery_dl/extractor/e621.py | 3 +++ gallery_dl/version.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 326b53b405..ab23520a34 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -70,6 +70,8 @@ def items(self): continue text.nameext_from_url(url, post) + post["date"] = text.parse_datetime( + post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") if post["extension"] == "zip": if self.ugoira: @@ -181,7 +183,7 @@ class DanbooruTagExtractor(DanbooruExtractor): "count": 12, }), ("https://aibooru.online/posts?tags=center_frills&z=1", { - "pattern": r"https://aibooru\.online/data/original" + "pattern": r"https://cdn\.aibooru\.online/original" r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w+", "count": ">= 3", }), @@ -245,6 +247,7 @@ class DanbooruPostExtractor(DanbooruExtractor): test = ( ("https://danbooru.donmai.us/posts/294929", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", + "keyword": {"date": "dt:2008-08-12 04:46:05"}, }), ("https://danbooru.donmai.us/posts/3613024", { "pattern": r"https?://.+\.zip$", diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 8f2994e429..65ef1e1c1a 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -57,6 +57,8 @@ def items(self): post["filename"] = file["md5"] post["extension"] = file["ext"] + post["date"] = text.parse_datetime( + post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") post.update(data) yield Message.Directory, post @@ -140,6 +142,7 @@ class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor): ("https://e621.net/posts/535", { "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529", "content": "66f46e96a893fba8e694c4e049b23c2acc9af462", + "keyword": {"date": "dt:2007-02-17 19:02:32"}, }), ("https://e621.net/posts/3181052", { "options": (("metadata", "notes,pools"),), diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 4f9e49a280..8d4c98a5f3 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.25.4" +__version__ = "1.25.5-dev" From c698c3de446c8ec61e8e9ea97439f265d14ade0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 11 May 2023 16:04:37 +0200 Subject: [PATCH 077/252] [newgrounds] add default delay between requests (#4046) --- gallery_dl/extractor/newgrounds.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 2b759ec07c..5d100a49f4 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -23,6 +23,7 @@ class NewgroundsExtractor(Extractor): root = "https://www.newgrounds.com" cookiedomain = ".newgrounds.com" cookienames = ("NG_GG_username", "vmk1du5I8m") + request_interval = 1.0 def __init__(self, match): Extractor.__init__(self, match) From 20dc13f8328673b0b9b0691d397e0bdb470a0472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 12 May 2023 16:01:19 +0200 Subject: [PATCH 078/252] [pixiv] initial 'novel' support (#1241, #4044) supported URLs are - https://www.pixiv.net/novel/show.php?id= - https://www.pixiv.net/novel/series/ - https://www.pixiv.net/en/users//novels --- docs/supportedsites.md | 2 +- gallery_dl/extractor/pixiv.py | 141 ++++++++++++++++++++++++++++++++++ scripts/supportedsites.py | 2 + 3 files changed, 144 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d951da9051..f98508ca67 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -646,7 +646,7 @@ Consider all sites to be NSFW unless otherwise known. Pixiv https://www.pixiv.net/ - Artworks, Avatars, Backgrounds, Favorites, Follows, pixiv.me Links, pixivision, Rankings, Search Results, Series, Sketch, User Profiles, individual Images + Artworks, Avatars, Backgrounds, Favorites, Follows, pixiv.me Links, Novels, Novel Series, pixivision, Rankings, Search Results, Series, Sketch, User Profiles, individual Images OAuth diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index b70403116e..2b5a62a6ad 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -172,6 +172,7 @@ def items(self): (PixivBackgroundExtractor, base + "background"), (PixivArtworksExtractor , base + "artworks"), (PixivFavoriteExtractor , base + "bookmarks/artworks"), + (PixivNovelUserExtractor , base + "novels"), ), ("artworks",)) @@ -750,6 +751,125 @@ def works(self): params["p"] += 1 +class PixivNovelExtractor(PixivExtractor): + """Extractor for pixiv novels""" + subcategory = "novel" + request_interval = 1.0 + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/novel/show\.php\?id=(\d+)") + test = ("https://www.pixiv.net/novel/show.php?id=19612040", { + "count": 1, + "content": "c6f22167f9df7aeaf63b51933b4c8ef6fc5e6a1e", + "keyword": { + "caption": r"re:「無能な名無し」と呼ばれ虐げられて育った鈴\(すず\)は、", + "comment_access_control": 0, + "create_date": "2023-04-02T15:18:58+09:00", + "date": "dt:2023-04-02 06:18:58", + "id": 19612040, + "is_bookmarked": False, + "is_muted": False, + "is_mypixiv_only": False, + "is_original": True, + "is_x_restricted": False, + "novel_ai_type": 1, + "page_count": 1, + "rating": "General", + "restrict": 0, + "series": { + "id": 10278364, + "title": "龍の贄嫁〜虐げられた少女は運命の番として愛される〜" + }, + "tags": ["和風ファンタジー", "溺愛", "神様", "ヤンデレ", "執着", + "異能", "ざまぁ", "学園", "神嫁"], + "text_length": 5977, + "title": "異母妹から「無能な名無し」と虐げられていた私、" + "どうやら異母妹に霊力を搾取されていたようです(1)", + "user": { + "account": "yukinaga_chifuyu", + "id": 77055466, + }, + "visible": True, + 
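+            # "x_restrict" 0/1/2 corresponds to the "rating" values
+            # "General"/"R-18"/"R-18G" assigned by the extractor code below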
"x_restrict": 0, + }, + }) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.novel_id = match.group(1) + + def items(self): + tags = self.config("tags", "japanese") + if tags == "original": + transform_tags = None + elif tags == "translated": + def transform_tags(work): + work["tags"] = list(dict.fromkeys( + tag["translated_name"] or tag["name"] + for tag in work["tags"])) + else: + def transform_tags(work): + work["tags"] = [tag["name"] for tag in work["tags"]] + + ratings = {0: "General", 1: "R-18", 2: "R-18G"} + meta_user = self.config("metadata") + meta_bookmark = self.config("metadata-bookmark") + + novels = self.novels() + if self.max_posts: + novels = itertools.islice(novels, self.max_posts) + for novel in novels: + if meta_user: + novel.update(self.api.user_detail(novel["user"]["id"])) + if meta_bookmark and novel["is_bookmarked"]: + detail = self.api.novel_bookmark_detail(novel["id"]) + novel["tags_bookmark"] = [tag["name"] for tag in detail["tags"] + if tag["is_registered"]] + if transform_tags: + transform_tags(novel) + novel["num"] = 0 + novel["date"] = text.parse_datetime(novel["create_date"]) + novel["rating"] = ratings.get(novel["x_restrict"]) + novel["suffix"] = "" + + yield Message.Directory, novel + + novel["extension"] = "txt" + content = self.api.novel_text(novel["id"])["novel_text"] + yield Message.Url, "text:" + content, novel + + def novels(self): + return (self.api.novel_detail(self.novel_id),) + + +class PixivNovelUserExtractor(PixivNovelExtractor): + """Extractor for pixiv users' novels""" + subcategory = "novel-user" + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/(?:en/)?users/(\d+)/novels") + test = ("https://www.pixiv.net/en/users/77055466/novels", { + "pattern": "^text:", + "range": "1-5", + "count": 5, + }) + + def novels(self): + return self.api.user_novels(self.novel_id) + + +class PixivNovelSeriesExtractor(PixivNovelExtractor): + """Extractor for pixiv novel series""" + subcategory = "novel-series" + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/novel/series/(\d+)") + test = ("https://www.pixiv.net/novel/series/10278364", { + "count": 4, + "content": "b06abed001b3f6ccfb1579699e9a238b46d38ea2", + }) + + def novels(self): + return self.api.novel_series(self.novel_id) + + class PixivSketchExtractor(Extractor): """Extractor for user pages on sketch.pixiv.net""" category = "pixiv" @@ -907,6 +1027,23 @@ def illust_related(self, illust_id): params = {"illust_id": illust_id} return self._pagination("/v2/illust/related", params) + def novel_bookmark_detail(self, novel_id): + params = {"novel_id": novel_id} + return self._call( + "/v2/novel/bookmark/detail", params)["bookmark_detail"] + + def novel_detail(self, novel_id): + params = {"novel_id": novel_id} + return self._call("/v2/novel/detail", params)["novel"] + + def novel_series(self, series_id): + params = {"series_id": series_id} + return self._pagination("/v1/novel/series", params, "novels") + + def novel_text(self, novel_id): + params = {"novel_id": novel_id} + return self._call("/v1/novel/text", params) + def search_illust(self, word, sort=None, target=None, duration=None, date_start=None, date_end=None): params = {"word": word, "search_target": target, @@ -938,6 +1075,10 @@ def user_illusts(self, user_id): params = {"user_id": user_id} return self._pagination("/v1/user/illusts", params) + def user_novels(self, user_id): + params = {"user_id": user_id} + return self._pagination("/v1/user/novels", params, "novels") + def ugoira_metadata(self, 
illust_id): params = {"illust_id": illust_id} return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"] diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index b25edea6ee..e38771a1df 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -213,6 +213,8 @@ }, "pixiv": { "me" : "pixiv.me Links", + "novel-series": "Novel Series", + "novel-user": "", "pixivision": "pixivision", "sketch": "Sketch", "work": "individual Images", From 2e6cea95db6089d31d411dc9f63044ccd6101a5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 12 May 2023 16:13:25 +0200 Subject: [PATCH 079/252] [cookies] update logging behavior (#4050) - only show the same warning/error once - simplify and capitalize logging messages --- gallery_dl/cookies.py | 147 ++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 69 deletions(-) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 32ba323de9..c5c5667bb2 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -75,7 +75,7 @@ def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None): domain, bool(domain), domain.startswith("."), path, bool(path), secure, expires, False, None, None, {}, )) - logger.info("Extracted %s cookies from Firefox", len(cookiejar)) + _log_info("Extracted %s cookies from Firefox", len(cookiejar)) def load_cookies_safari(cookiejar, profile=None, domain=None): @@ -98,7 +98,7 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None, keyring=None, domain=None): config = _get_chromium_based_browser_settings(browser_name) path = _chrome_cookies_database(profile, config) - logger.debug("Extracting cookies from %s", path) + _log_debug("Extracting cookies from %s", path) with DatabaseCopy(path) as db: db.text_factory = bytes @@ -155,11 +155,11 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None, else: failed_message = "" - logger.info("Extracted %s cookies from %s%s", - len(cookiejar), browser_name.capitalize(), failed_message) + _log_info("Extracted %s cookies from %s%s", + len(cookiejar), browser_name.capitalize(), failed_message) counts = decryptor.cookie_counts counts["unencrypted"] = unencrypted_cookies - logger.debug("cookie version breakdown: %s", counts) + _log_debug("Cookie version breakdown: %s", counts) # -------------------------------------------------------------------- @@ -177,11 +177,11 @@ def _firefox_cookies_database(profile=None, container=None): if path is None: raise FileNotFoundError("Unable to find Firefox cookies database in " "{}".format(search_root)) - logger.debug("Extracting cookies from %s", path) + _log_debug("Extracting cookies from %s", path) if container == "none": container_id = False - logger.debug("Only loading cookies not belonging to any container") + _log_debug("Only loading cookies not belonging to any container") elif container: containers_path = os.path.join( @@ -191,8 +191,8 @@ def _firefox_cookies_database(profile=None, container=None): with open(containers_path) as file: identities = util.json_loads(file.read())["identities"] except OSError: - logger.error("Unable to read Firefox container database at %s", - containers_path) + _log_error("Unable to read Firefox container database at '%s'", + containers_path) raise except KeyError: identities = () @@ -203,10 +203,10 @@ def _firefox_cookies_database(profile=None, container=None): container_id = context["userContextId"] break else: - raise ValueError("Unable to find Firefox container {}".format( + raise ValueError("Unable 
to find Firefox container '{}'".format( container)) - logger.debug("Only loading cookies from container '%s' (ID %s)", - container, container_id) + _log_debug("Only loading cookies from container '%s' (ID %s)", + container, container_id) else: container_id = None @@ -229,7 +229,7 @@ def _safari_cookies_database(): path = os.path.expanduser("~/Library/Cookies/Cookies.binarycookies") return open(path, "rb") except FileNotFoundError: - logger.debug("Trying secondary cookie location") + _log_debug("Trying secondary cookie location") path = os.path.expanduser("~/Library/Containers/com.apple.Safari/Data" "/Library/Cookies/Cookies.binarycookies") return open(path, "rb") @@ -250,7 +250,7 @@ def _safari_parse_cookies_page(data, cookiejar, domain=None): number_of_cookies = p.read_uint() record_offsets = [p.read_uint() for _ in range(number_of_cookies)] if number_of_cookies == 0: - logger.debug("a cookies page of size %s has no cookies", len(data)) + _log_debug("Cookies page of size %s has no cookies", len(data)) return p.skip_to(record_offsets[0], "unknown page header field") @@ -299,8 +299,7 @@ def _safari_parse_cookies_record(data, cookiejar, host=None): p.skip_to(value_offset) value = p.read_cstring() except UnicodeDecodeError: - logger.warning("failed to parse Safari cookie " - "because UTF-8 decoding failed") + _log_warning("Failed to parse Safari cookie") return record_size p.skip_to(record_size, "space at the end of the record") @@ -328,7 +327,7 @@ def _chrome_cookies_database(profile, config): elif config["profiles"]: search_root = os.path.join(config["directory"], profile) else: - logger.warning("%s does not support profiles", config["browser"]) + _log_warning("%s does not support profiles", config["browser"]) search_root = config["directory"] path = _find_most_recently_used_file(search_root, "Cookies") @@ -479,7 +478,7 @@ def decrypt(self, encrypted_value): elif version == b"v11": self._cookie_counts["v11"] += 1 if self._v11_key is None: - logger.warning("cannot decrypt v11 cookies: no key found") + _log_warning("Unable to decrypt v11 cookies: no key found") return None return _decrypt_aes_cbc(ciphertext, self._v11_key) @@ -513,7 +512,7 @@ def decrypt(self, encrypted_value): if version == b"v10": self._cookie_counts["v10"] += 1 if self._v10_key is None: - logger.warning("cannot decrypt v10 cookies: no key found") + _log_warning("Unable to decrypt v10 cookies: no key found") return None return _decrypt_aes_cbc(ciphertext, self._v10_key) @@ -543,7 +542,7 @@ def decrypt(self, encrypted_value): if version == b"v10": self._cookie_counts["v10"] += 1 if self._v10_key is None: - logger.warning("cannot decrypt v10 cookies: no key found") + _log_warning("Unable to decrypt v10 cookies: no key found") return None # https://chromium.googlesource.com/chromium/src/+/refs/heads @@ -581,7 +580,7 @@ def _choose_linux_keyring(): SelectBackend """ desktop_environment = _get_linux_desktop_environment(os.environ) - logger.debug("Detected desktop environment: %s", desktop_environment) + _log_debug("Detected desktop environment: %s", desktop_environment) if desktop_environment == DE_KDE: return KEYRING_KWALLET if desktop_environment == DE_OTHER: @@ -609,23 +608,23 @@ def _get_kwallet_network_wallet(): ) if proc.returncode != 0: - logger.warning("failed to read NetworkWallet") + _log_warning("Failed to read NetworkWallet") return default_wallet else: network_wallet = stdout.decode().strip() - logger.debug("NetworkWallet = '%s'", network_wallet) + _log_debug("NetworkWallet = '%s'", network_wallet) return 
network_wallet except Exception as exc: - logger.warning("exception while obtaining NetworkWallet (%s: %s)", - exc.__class__.__name__, exc) + _log_warning("Error while obtaining NetworkWallet (%s: %s)", + exc.__class__.__name__, exc) return default_wallet def _get_kwallet_password(browser_keyring_name): - logger.debug("using kwallet-query to obtain password from kwallet") + _log_debug("Using kwallet-query to obtain password from kwallet") if shutil.which("kwallet-query") is None: - logger.error( + _log_error( "kwallet-query command not found. KWallet and kwallet-query " "must be installed to read from KWallet. kwallet-query should be " "included in the kwallet package for your distribution") @@ -642,14 +641,14 @@ def _get_kwallet_password(browser_keyring_name): ) if proc.returncode != 0: - logger.error("kwallet-query failed with return code {}. " - "Please consult the kwallet-query man page " - "for details".format(proc.returncode)) + _log_error("kwallet-query failed with return code {}. " + "Please consult the kwallet-query man page " + "for details".format(proc.returncode)) return b"" if stdout.lower().startswith(b"failed to read"): - logger.debug("Failed to read password from kwallet. " - "Using empty string instead") + _log_debug("Failed to read password from kwallet. " + "Using empty string instead") # This sometimes occurs in KDE because chrome does not check # hasEntry and instead just tries to read the value (which # kwallet returns "") whereas kwallet-query checks hasEntry. @@ -660,13 +659,12 @@ def _get_kwallet_password(browser_keyring_name): # random password and store it, but that doesn't matter here. return b"" else: - logger.debug("password found") if stdout[-1:] == b"\n": stdout = stdout[:-1] return stdout except Exception as exc: - logger.warning("exception running kwallet-query (%s: %s)", - exc.__class__.__name__, exc) + _log_warning("Error when running kwallet-query (%s: %s)", + exc.__class__.__name__, exc) return b"" @@ -674,7 +672,7 @@ def _get_gnome_keyring_password(browser_keyring_name): try: import secretstorage except ImportError: - logger.error("secretstorage not available") + _log_error("'secretstorage' Python package not available") return b"" # Gnome keyring does not seem to organise keys in the same way as KWallet, @@ -689,7 +687,7 @@ def _get_gnome_keyring_password(browser_keyring_name): if item.get_label() == label: return item.get_secret() else: - logger.error("failed to read from keyring") + _log_error("Failed to read from GNOME keyring") return b"" @@ -703,7 +701,7 @@ def _get_linux_keyring_password(browser_keyring_name, keyring): if not keyring: keyring = _choose_linux_keyring() - logger.debug("Chosen keyring: %s", keyring) + _log_debug("Chosen keyring: %s", keyring) if keyring == KEYRING_KWALLET: return _get_kwallet_password(browser_keyring_name) @@ -717,8 +715,8 @@ def _get_linux_keyring_password(browser_keyring_name, keyring): def _get_mac_keyring_password(browser_keyring_name): - logger.debug("using find-generic-password to obtain " - "password from OSX keychain") + _log_debug("Using find-generic-password to obtain " + "password from OSX keychain") try: proc, stdout = Popen_communicate( "security", "find-generic-password", @@ -731,28 +729,28 @@ def _get_mac_keyring_password(browser_keyring_name): stdout = stdout[:-1] return stdout except Exception as exc: - logger.warning("exception running find-generic-password (%s: %s)", - exc.__class__.__name__, exc) + _log_warning("Error when using find-generic-password (%s: %s)", + exc.__class__.__name__, exc) 
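+        # fall through to the "return None" below, so a failed keychain
+        # read is handled the same as a missing password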
return None def _get_windows_v10_key(browser_root): path = _find_most_recently_used_file(browser_root, "Local State") if path is None: - logger.error("could not find local state file") + _log_error("Unable to find Local State file") return None - logger.debug("Found local state file at '%s'", path) + _log_debug("Found Local State file at '%s'", path) with open(path, encoding="utf-8") as file: data = util.json_loads(file.read()) try: base64_key = data["os_crypt"]["encrypted_key"] except KeyError: - logger.error("no encrypted key in Local State") + _log_error("Unable to find encrypted key in Local State") return None encrypted_key = binascii.a2b_base64(base64_key) prefix = b"DPAPI" if not encrypted_key.startswith(prefix): - logger.error("invalid key") + _log_error("Invalid Local State key") return None return _decrypt_windows_dpapi(encrypted_key[len(prefix):]) @@ -804,10 +802,10 @@ def read_cstring(self): def skip(self, num_bytes, description="unknown"): if num_bytes > 0: - logger.debug("skipping {} bytes ({}): {!r}".format( + _log_debug("Skipping {} bytes ({}): {!r}".format( num_bytes, description, self.read_bytes(num_bytes))) elif num_bytes < 0: - raise ParserError("invalid skip of {} bytes".format(num_bytes)) + raise ParserError("Invalid skip of {} bytes".format(num_bytes)) def skip_to(self, offset, description="unknown"): self.skip(offset - self.cursor, description) @@ -929,31 +927,25 @@ def pbkdf2_sha1(password, salt, iterations, key_length): def _decrypt_aes_cbc(ciphertext, key, initialization_vector=b" " * 16): - plaintext = aes.unpad_pkcs7( - aes.aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) try: - return plaintext.decode() + return aes.unpad_pkcs7(aes.aes_cbc_decrypt_bytes( + ciphertext, key, initialization_vector)).decode() except UnicodeDecodeError: - logger.warning("failed to decrypt cookie (AES-CBC) because UTF-8 " - "decoding failed. Possibly the key is wrong?") - return None + _log_warning("Failed to decrypt cookie (AES-CBC Unicode)") + except ValueError: + _log_warning("Failed to decrypt cookie (AES-CBC)") + return None def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag): try: - plaintext = aes.aes_gcm_decrypt_and_verify_bytes( - ciphertext, key, authentication_tag, nonce) - except ValueError: - logger.warning("failed to decrypt cookie (AES-GCM) because MAC check " - "failed. Possibly the key is wrong?") - return None - - try: - return plaintext.decode() + return aes.aes_gcm_decrypt_and_verify_bytes( + ciphertext, key, authentication_tag, nonce).decode() except UnicodeDecodeError: - logger.warning("failed to decrypt cookie (AES-GCM) because UTF-8 " - "decoding failed. 
Possibly the key is wrong?") - return None + _log_warning("Failed to decrypt cookie (AES-GCM Unicode)") + except ValueError: + _log_warning("Failed to decrypt cookie (AES-GCM MAC)") + return None def _decrypt_windows_dpapi(ciphertext): @@ -981,7 +973,7 @@ class DATA_BLOB(ctypes.Structure): ctypes.byref(blob_out) # pDataOut ) if not ret: - logger.warning("failed to decrypt with DPAPI") + _log_warning("Failed to decrypt cookie (DPAPI)") return None result = ctypes.string_at(blob_out.pbData, blob_out.cbData) @@ -1009,9 +1001,26 @@ def _parse_browser_specification( browser, profile=None, keyring=None, container=None, domain=None): browser = browser.lower() if browser not in SUPPORTED_BROWSERS: - raise ValueError("unsupported browser '{}'".format(browser)) + raise ValueError("Unsupported browser '{}'".format(browser)) if keyring and keyring not in SUPPORTED_KEYRINGS: - raise ValueError("unsupported keyring '{}'".format(keyring)) + raise ValueError("Unsupported keyring '{}'".format(keyring)) if profile and _is_path(profile): profile = os.path.expanduser(profile) return browser, profile, keyring, container, domain + + +_log_cache = set() +_log_debug = logger.debug +_log_info = logger.info + + +def _log_warning(msg, *args): + if msg not in _log_cache: + _log_cache.add(msg) + logger.warning(msg, *args) + + +def _log_error(msg, *args): + if msg not in _log_cache: + _log_cache.add(msg) + logger.error(msg, *args) From 790dd365e15a21fc5caeabec9b9259167f4f6f12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 12 May 2023 16:28:19 +0200 Subject: [PATCH 080/252] [postprocessor:exec] support tilde expansion for 'command' https://github.com/mikf/gallery-dl/issues/146#issuecomment-1544733532 --- gallery_dl/postprocessor/exec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py index e81c6cfe27..39188f1684 100644 --- a/gallery_dl/postprocessor/exec.py +++ b/gallery_dl/postprocessor/exec.py @@ -11,6 +11,7 @@ from .common import PostProcessor from .. import util, formatter import subprocess +import os if util.WINDOWS: @@ -60,6 +61,7 @@ def exec_list(self, pathfmt, status=None): kwdict["_path"] = pathfmt.realpath args = [arg.format_map(kwdict) for arg in self.args] + args[0] = os.path.expanduser(args[0]) self._exec(args, False) if archive: From e41e45ff6bf1994354346220b3c94c307dcda315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 13 May 2023 15:46:29 +0200 Subject: [PATCH 081/252] [gofile] add basic password support (#4056) --- gallery_dl/extractor/gofile.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index b53ebbe3c7..3190725ea2 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -7,6 +7,7 @@ from .common import Extractor, Message from .. 
import text, exception from ..cache import memcache +import hashlib class GofileFolderExtractor(Extractor): @@ -66,6 +67,7 @@ def __init__(self, match): def items(self): recursive = self.config("recursive") + password = self.config("password") token = self.config("api-token") if not token: @@ -78,7 +80,7 @@ def items(self): token = self._get_website_token() self.website_token = token - folder = self._get_content(self.content_id) + folder = self._get_content(self.content_id, password) yield Message.Directory, folder num = 0 @@ -115,11 +117,14 @@ def _get_website_token(self): page = self.request(self.root + "/contents/files.html").text return text.extract(page, "websiteToken:", ",")[0].strip("\" ") - def _get_content(self, content_id): + def _get_content(self, content_id, password=None): + if password is not None: + password = hashlib.sha256(password.encode()).hexdigest() return self._api_request("getContent", { "contentId" : content_id, "token" : self.api_token, "websiteToken": self.website_token, + "password" : password, }) def _api_request(self, endpoint, params=None): From fd0e1ffd6ef2b8cb24a78681106cd7e7a38d8ba4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 14 May 2023 16:35:31 +0200 Subject: [PATCH 082/252] [danbooru] improve 75666cf9 (#4002) Search for direct post IDs instead of trying to replicate the same results as the initial request. --- gallery_dl/extractor/danbooru.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index ab23520a34..ded25d331d 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -105,16 +105,17 @@ def _pagination(self, endpoint, params, pages=False): posts = posts["posts"] if self.includes and posts: - if not pages and "only" not in params: - params["page"] = "b{}".format(posts[0]["id"] + 1) - params["only"] = self.includes + params_meta = { + "only" : self.includes, + "limit": len(posts), + "tags" : "id:" + ",".join(str(p["id"]) for p in posts), + } data = { meta["id"]: meta - for meta in self.request(url, params=params).json() + for meta in self.request(url, params=params_meta).json() } for post in posts: post.update(data[post["id"]]) - params["only"] = None yield from posts @@ -155,7 +156,7 @@ def _ugoira_frames(self, post): "aibooru": { "root": None, "pattern": r"(?:safe.)?aibooru\.online", - } + }, }) From 494acabd38fefdc8c1b482b584695aaa788f9112 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 14 May 2023 18:39:59 +0200 Subject: [PATCH 083/252] [danbooru] refactor pagination logic (#4002) - only use 'b' when no other order is specified - support 'a' when using 'order:id' as tag --- gallery_dl/extractor/danbooru.py | 73 +++++++++++++++++++------------- gallery_dl/extractor/e621.py | 8 +--- 2 files changed, 46 insertions(+), 35 deletions(-) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index ded25d331d..5cfbf5c481 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -94,43 +94,47 @@ def metadata(self): def posts(self): return () - def _pagination(self, endpoint, params, pages=False): + def _pagination(self, endpoint, params, prefix=None): url = self.root + endpoint params["limit"] = self.per_page params["page"] = self.page_start + first = True while True: posts = self.request(url, params=params).json() - if "posts" in posts: + if isinstance(posts, dict): posts = posts["posts"] - if self.includes and 
posts: - params_meta = { - "only" : self.includes, - "limit": len(posts), - "tags" : "id:" + ",".join(str(p["id"]) for p in posts), - } - data = { - meta["id"]: meta - for meta in self.request(url, params=params_meta).json() - } - for post in posts: - post.update(data[post["id"]]) - - yield from posts + if posts: + if self.includes: + params_meta = { + "only" : self.includes, + "limit": len(posts), + "tags" : "id:" + ",".join(str(p["id"]) for p in posts), + } + data = { + meta["id"]: meta + for meta in self.request( + url, params=params_meta).json() + } + for post in posts: + post.update(data[post["id"]]) + + if prefix == "a" and not first: + posts.reverse() + + yield from posts if len(posts) < self.threshold: return - if pages: + if prefix: + params["page"] = "{}{}".format(prefix, posts[-1]["id"]) + elif params["page"]: params["page"] += 1 else: - for post in reversed(posts): - if "id" in post: - params["page"] = "b{}".format(post["id"]) - break - else: - return + params["page"] = 2 + first = False def _ugoira_frames(self, post): data = self.request("{}/posts/{}.json?only=media_metadata".format( @@ -203,7 +207,21 @@ def metadata(self): return {"search_tags": self.tags} def posts(self): - return self._pagination("/posts.json", {"tags": self.tags}) + prefix = "b" + for tag in self.tags.split(): + if tag.startswith("order:"): + if tag == "order:id" or tag == "order:id_asc": + prefix = "a" + elif tag == "order:id_desc": + prefix = "b" + else: + prefix = None + elif tag.startswith( + ("id:", "md5", "ordfav:", "ordfavgroup:", "ordpool:")): + prefix = None + break + + return self._pagination("/posts.json", {"tags": self.tags}, prefix) class DanbooruPoolExtractor(DanbooruExtractor): @@ -237,7 +255,7 @@ def metadata(self): def posts(self): params = {"tags": "pool:" + self.pool_id} - return self._pagination("/posts.json", params) + return self._pagination("/posts.json", params, "b") class DanbooruPostExtractor(DanbooruExtractor): @@ -311,7 +329,4 @@ def metadata(self): return {"date": date, "scale": scale} def posts(self): - if self.page_start is None: - self.page_start = 1 - return self._pagination( - "/explore/posts/popular.json", self.params, True) + return self._pagination("/explore/posts/popular.json", self.params) diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 65ef1e1c1a..d4f6cd4b35 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -219,9 +219,7 @@ class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor): ) def posts(self): - if self.page_start is None: - self.page_start = 1 - return self._pagination("/popular.json", self.params, True) + return self._pagination("/popular.json", self.params) class E621FavoriteExtractor(E621Extractor): @@ -252,6 +250,4 @@ def metadata(self): return {"user_id": self.query.get("user_id", "")} def posts(self): - if self.page_start is None: - self.page_start = 1 - return self._pagination("/favorites.json", self.query, True) + return self._pagination("/favorites.json", self.query) From 56db930c47f492e395e33a03b704f0afde522758 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 14 May 2023 18:43:37 +0200 Subject: [PATCH 084/252] don't include 'toml' package in PyInstaller binary My own system now has Python 3.11 with native TOML support. 
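Looking back at the Danbooru pagination refactor in the two patches above: a tag search now decides up front between ID-cursor pages (`a<id>`/`b<id>`) and plain numeric pages. A rough standalone sketch of that decision, mirroring the logic in `DanbooruTagExtractor.posts()` (an illustration, not the library code itself):

```python
def pagination_prefix(tags):
    prefix = "b"                  # default: page backwards with "b<id>"
    for tag in tags.split():
        if tag.startswith("order:"):
            if tag in ("order:id", "order:id_asc"):
                prefix = "a"      # ascending IDs page forward with "a<id>"
            elif tag == "order:id_desc":
                prefix = "b"
            else:
                prefix = None     # arbitrary order: fall back to page numbers
        elif tag.startswith(
                ("id:", "md5", "ordfav:", "ordfavgroup:", "ordpool:")):
            return None           # these searches don't mix with ID cursors
    return prefix

assert pagination_prefix("1girl order:id") == "a"
assert pagination_prefix("id:12345") is None
```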
--- scripts/release.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/release.sh b/scripts/release.sh index c675713d81..f32c796d05 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -59,7 +59,7 @@ build-linux() { rm -rf "${VENV_PATH}" python -m virtualenv "${VENV_PATH}" - $VENV_PYTHON -m pip install requests requests[socks] yt-dlp pyyaml toml secretstorage pyinstaller + $VENV_PYTHON -m pip install requests requests[socks] yt-dlp pyyaml secretstorage pyinstaller $VENV_PYTHON ./scripts/pyinstaller.py } From 6b6bb4be73c80a124301d76d6fd422710a34fbfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 14 May 2023 18:45:37 +0200 Subject: [PATCH 085/252] [weibo] require numeric IDs to have length >= 10 (#4059) --- gallery_dl/extractor/weibo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 388ee035f2..2cbfad6e7b 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -132,7 +132,7 @@ def _status_by_id(self, status_id): return self.request(url).json() def _user_id(self): - if self.user.isdecimal(): + if len(self.user) >= 10 and self.user.isdecimal(): return self.user[-10:] else: url = "{}/ajax/profile/info?{}={}".format( From 5037013e2b2d21a0cf71ee4269fb0f49294ef138 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 15 May 2023 14:59:44 +0200 Subject: [PATCH 086/252] [gofile] update 'website-token' (#4056) --- docs/configuration.rst | 2 +- gallery_dl/extractor/gofile.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 6d5ea7c231..bb89b7c1e7 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1687,7 +1687,7 @@ extractor.gofile.website-token Type ``string`` Default - ``"12345"`` + ``"abcde"`` Description API token value used during API requests. diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 3190725ea2..4d18f3d2b3 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -75,7 +75,7 @@ def items(self): self.session.cookies.set("accountToken", token, domain=".gofile.io") self.api_token = token - token = self.config("website-token", "12345") + token = self.config("website-token", "abcde") if not token: token = self._get_website_token() self.website_token = token @@ -114,8 +114,8 @@ def _create_account(self): @memcache() def _get_website_token(self): self.log.debug("Fetching website token") - page = self.request(self.root + "/contents/files.html").text - return text.extract(page, "websiteToken:", ",")[0].strip("\" ") + page = self.request(self.root + "/dist/js/alljs.js").text + return text.extr(page, 'fetchData.websiteToken = "', '"') def _get_content(self, content_id, password=None): if password is not None: From 011e4607c3952c5761e6014377ddbdde53f10bf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 15 May 2023 23:16:31 +0200 Subject: [PATCH 087/252] [poipiku] extract full 'descriptions' (#4066) don't cut it off after the first line --- gallery_dl/extractor/poipiku.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index 49da9ce347..14c25c4492 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -41,7 +41,7 @@ def items(self): "user_name" : text.unescape(extr( '
<h2 class="UserInfoUserName">', '</h2>').rpartition(">")[2]), "description": text.unescape(extr( - 'class="IllustItemDesc" >', '<')), + 'class="IllustItemDesc" >', '</h1>')), "_http_headers": {"Referer": post_url}, } @@ -172,7 +172,9 @@ class PoipikuPostExtractor(PoipikuExtractor): "count": 3, "keyword": { "count": "3", - "description": "ORANGE OASISボスネタバレ", + "description": "ORANGE OASISボスネタバレ<br />曲も大好き<br />
    " + "2枚目以降はほとんど見えなかった1枚目背景" + "のヒエログリフ小ネタです𓀀", "num": int, "post_category": "SPOILER", "post_id": "5776587", From 82a12d6126bf9ee452a2fc6ece3043d19d33e325 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 17 May 2023 15:18:10 +0200 Subject: [PATCH 088/252] [nsfwalbum] detect placeholder images patch by an anonymous contributor --- gallery_dl/extractor/nsfwalbum.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/nsfwalbum.py b/gallery_dl/extractor/nsfwalbum.py index be736d10ba..6433fbd2d0 100644 --- a/gallery_dl/extractor/nsfwalbum.py +++ b/gallery_dl/extractor/nsfwalbum.py @@ -75,7 +75,8 @@ def images(self, page): @staticmethod def _validate_response(response): - return not response.request.url.endswith("/no_image.jpg") + return not response.request.url.endswith( + ("/no_image.jpg", "/placeholder.png")) @staticmethod def _annihilate(value, base=6): From 04dbfd994e34163bca922453e0aee47eb85886ca Mon Sep 17 00:00:00 2001 From: Naatie Date: Sun, 23 Apr 2023 15:24:25 +0700 Subject: [PATCH 089/252] [misskey] add my favorites extractor --- gallery_dl/extractor/misskey.py | 38 ++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py index 03e9104547..7e5ef799d3 100644 --- a/gallery_dl/extractor/misskey.py +++ b/gallery_dl/extractor/misskey.py @@ -7,7 +7,7 @@ """Extractors for Misskey instances""" from .common import BaseExtractor, Message -from .. import text +from .. import text, exception class MisskeyExtractor(BaseExtractor): @@ -152,6 +152,33 @@ def notes(self): return (self.api.notes_show(self.item),) +class MisskeyMyFavoritesExtractor(MisskeyExtractor): + """Extractor for images from favorites""" + subcategory = "favorites" + pattern = BASE_PATTERN + r"(/my/favorites|/api/i/favorites)" + test = ( + ("https://misskey.io/my/favorites",), + ("https://misskey.io/api/i/favorites",), + ) + + def items(self): + for fav in self.api.i_favorites(): + note = fav.get("note") + note["instance"] = self.instance + note["instance_remote"] = note["user"]["host"] + note["count"] = len(note["files"]) + note["date"] = text.parse_datetime( + note["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z") + + yield Message.Directory, note + for note["num"], file in enumerate(note["files"], 1): + file["date"] = text.parse_datetime( + file["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z") + note["file"] = file + url = file["url"] + yield Message.Url, url, text.nameext_from_url(url, note) + + class MisskeyAPI(): """Interface for Misskey API @@ -164,6 +191,7 @@ def __init__(self, extractor): self.root = extractor.root self.extractor = extractor self.headers = {"Content-Type": "application/json"} + self.access_token = extractor.config("access-token") def user_id_by_username(self, username): endpoint = "/users/show" @@ -187,6 +215,14 @@ def notes_show(self, note_id): data = {"noteId": note_id} return self._call(endpoint, data) + def i_favorites(self): + endpoint = "/i/favorites" + data = {} + if not self.access_token: + raise exception.AuthenticationError() + data["i"] = self.access_token + return self._pagination(endpoint, data) + def _call(self, endpoint, data): url = self.root + "/api" + endpoint return self.extractor.request( From f9b7a033e02e9be5225f3c85fba31bfe08e7a059 Mon Sep 17 00:00:00 2001 From: Naatie Date: Wed, 26 Apr 2023 14:44:47 +0700 Subject: [PATCH 090/252] [misskey] refactor misskey extractor --- gallery_dl/extractor/misskey.py | 27 
+++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py index 7e5ef799d3..26b7f19483 100644 --- a/gallery_dl/extractor/misskey.py +++ b/gallery_dl/extractor/misskey.py @@ -27,6 +27,8 @@ def __init__(self, match): def items(self): for note in self.notes(): + if "note" in note: + note = note["note"] files = note.pop("files") or [] renote = note.get("renote") if renote: @@ -157,26 +159,12 @@ class MisskeyMyFavoritesExtractor(MisskeyExtractor): subcategory = "favorites" pattern = BASE_PATTERN + r"(/my/favorites|/api/i/favorites)" test = ( - ("https://misskey.io/my/favorites",), - ("https://misskey.io/api/i/favorites",), + ("https://misskey.io/my/favorites"), + ("https://misskey.io/api/i/favorites"), ) - def items(self): - for fav in self.api.i_favorites(): - note = fav.get("note") - note["instance"] = self.instance - note["instance_remote"] = note["user"]["host"] - note["count"] = len(note["files"]) - note["date"] = text.parse_datetime( - note["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z") - - yield Message.Directory, note - for note["num"], file in enumerate(note["files"], 1): - file["date"] = text.parse_datetime( - file["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z") - note["file"] = file - url = file["url"] - yield Message.Url, url, text.nameext_from_url(url, note) + def notes(self): + return self.api.i_favorites() class MisskeyAPI(): @@ -217,10 +205,9 @@ def notes_show(self, note_id): def i_favorites(self): endpoint = "/i/favorites" - data = {} if not self.access_token: raise exception.AuthenticationError() - data["i"] = self.access_token + data = {"i": self.access_token} return self._pagination(endpoint, data) def _call(self, endpoint, data): From d680623db369bc4acaa82f7118f81607937394e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 18 May 2023 22:34:33 +0200 Subject: [PATCH 091/252] [instagram] add 'order-files' option (#4017, #3993) --- docs/configuration.rst | 17 +++++++++++++++++ gallery_dl/extractor/instagram.py | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index bb89b7c1e7..1ce75cb727 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1817,6 +1817,23 @@ Description It is possible to use ``"all"`` instead of listing all values separately. +extractor.instagram.order-files +------------------------------- +Type + ``string`` +Default + ``"asc"`` +Description + Controls the order in which files of each post are returned. + + * ``"asc"``: Same order as displayed in a post + * ``"desc"``: Reverse order as displayed in a post + * ``"reverse"``: Same as ``"desc"`` + + Note: This option does *not* affect ``{num}``. + To enumerate files in reverse order, use ``count - num + 1``. 
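A toy illustration of that note, assuming (as the note implies) that each file's `num` is assigned at parse time in display order and is therefore untouched by the reversal; the file names are hypothetical:

```python
files = [                        # "num" is the position as displayed in the post
    {"num": 1, "name": "a.jpg"},
    {"num": 2, "name": "b.jpg"},
    {"num": 3, "name": "c.jpg"},
]
count = len(files)

files.reverse()                  # what "order-files": "desc" does
for file in files:
    position = count - file["num"] + 1   # 1, 2, 3 in the new download order
    print(position, file["name"], file["num"])
```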
+ + extractor.instagram.previews ---------------------------- Type diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 4c1be0fbb3..3f760ebb5e 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -55,6 +55,9 @@ def items(self): previews = self.config("previews", False) video_headers = {"User-Agent": "Mozilla/5.0"} + order = self.config("order-files") + reverse = order[0] in ("r", "d") if order else False + for post in self.posts(): if "__typename" in post: @@ -71,6 +74,8 @@ def items(self): if "date" in post: del post["date"] + if reverse: + files.reverse() for file in files: file.update(post) From a83983c6515520cc17bd68947d51c03d2937c07d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 18 May 2023 22:50:04 +0200 Subject: [PATCH 092/252] [instagram] add 'order-posts' option (#4017, #3993) --- docs/configuration.rst | 18 ++++++++++++++++++ docs/gallery-dl.conf | 3 +++ gallery_dl/extractor/instagram.py | 14 ++++++++++++-- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 1ce75cb727..4b156a1ea2 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1834,6 +1834,24 @@ Description To enumerate files in reverse order, use ``count - num + 1``. +extractor.instagram.order-posts +------------------------------- +Type + ``string`` +Default + ``"asc"`` +Description + Controls the order in which posts are returned. + + * ``"asc"``: Same order as displayed + * ``"desc"``: Reverse order as displayed + * ``"id"`` or ``"id_asc"``: Ascending order by ID + * ``"id_desc"``: Descending order by ID + * ``"reverse"``: Same as ``"desc"`` + + Note: This option only affects ``highlights``. + + extractor.instagram.previews ---------------------------- Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 92451fda16..b47a02ff0d 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -166,6 +166,9 @@ "api": "rest", "cookies": null, "include": "posts", + "order-files": "asc", + "order-posts": "asc", + "previews": false, "sleep-request": [6.0, 12.0], "videos": true }, diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 3f760ebb5e..1e1de9442b 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -761,10 +761,20 @@ def guide_media(self, guide_id): endpoint = "/v1/guides/guide/{}/".format(guide_id) return self._pagination_guides(endpoint) - def highlights_media(self, user_id): - chunk_size = 5 + def highlights_media(self, user_id, chunk_size=5): reel_ids = [hl["id"] for hl in self.highlights_tray(user_id)] + order = self.extractor.config("order-posts") + if order: + if order in ("desc", "reverse"): + reel_ids.reverse() + elif order in ("id", "id_asc"): + reel_ids.sort(key=lambda r: int(r[10:])) + elif order == "id_desc": + reel_ids.sort(key=lambda r: int(r[10:]), reverse=True) + elif order != "asc": + self.extractor.log.warning("Unknown posts order '%s'", order) + for offset in range(0, len(reel_ids), chunk_size): yield from self.reels_media( reel_ids[offset : offset+chunk_size]) From e3fed9bd17e86a466c56fdbb2249aba483ab799e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 20 May 2023 13:58:59 +0200 Subject: [PATCH 093/252] [tcbscans] update domain to 'tcbscans.com' (#4080) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/tcbscans.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git 
a/docs/supportedsites.md b/docs/supportedsites.md index f98508ca67..e42cdf675e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -813,7 +813,7 @@ Consider all sites to be NSFW unless otherwise known. TCB Scans - https://onepiecechapters.com/ + https://tcbscans.com/ Chapters, Manga diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py index cac5a545b3..b5a730a4bb 100644 --- a/gallery_dl/extractor/tcbscans.py +++ b/gallery_dl/extractor/tcbscans.py @@ -4,19 +4,20 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://onepiecechapters.com/""" +"""Extractors for https://tcbscans.com/""" from .common import ChapterExtractor, MangaExtractor from .. import text +BASE_PATTERN = r"(?:https?://)?(?:tcbscans|onepiecechapters)\.com" + class TcbscansChapterExtractor(ChapterExtractor): category = "tcbscans" - pattern = (r"(?:https?://)?onepiecechapters\.com" - r"(/chapters/\d+/[^/?#]+)") - root = "https://onepiecechapters.com" + root = "https://tcbscans.com" + pattern = BASE_PATTERN + r"(/chapters/\d+/[^/?#]+)" test = ( - (("https://onepiecechapters.com" + (("https://tcbscans.com" "/chapters/4708/chainsaw-man-chapter-108"), { "pattern": (r"https://cdn\.[^/]+" r"/(file|attachments/[^/]+)/[^/]+/[^.]+\.\w+"), @@ -66,12 +67,11 @@ def metadata(self, page): class TcbscansMangaExtractor(MangaExtractor): category = "tcbscans" + root = "https://tcbscans.com" chapterclass = TcbscansChapterExtractor - pattern = (r"(?:https?://)?onepiecechapters\.com" - r"(/mangas/\d+/[^/?#]+)") - root = "https://onepiecechapters.com" + pattern = BASE_PATTERN + r"(/mangas/\d+/[^/?#]+)" test = ( - ("https://onepiecechapters.com/mangas/13/chainsaw-man", { + ("https://tcbscans.com/mangas/13/chainsaw-man", { "pattern": TcbscansChapterExtractor.pattern, "range" : "1-50", "count" : 50, From 77abcf5ab3f8c3380217f7ad9a445a308b8d51cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 20 May 2023 16:58:21 +0200 Subject: [PATCH 094/252] [gofile] automatically fetch 'website-token' by default the hardcoded token changed yet again --- docs/configuration.rst | 9 ++++----- docs/gallery-dl.conf | 2 +- gallery_dl/extractor/gofile.py | 10 ++++------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 4b156a1ea2..32b75d7431 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1686,15 +1686,14 @@ extractor.gofile.website-token ------------------------------ Type ``string`` -Default - ``"abcde"`` Description API token value used during API requests. - A not up-to-date value will result in ``401 Unauthorized`` errors. + An invalid or not up-to-date value + will result in ``401 Unauthorized`` errors. - Setting this value to ``null`` will do an extra HTTP request to fetch - the current value used by gofile. + Keeping this option unset will use an extra HTTP request + to attempt to fetch the current value used by gofile. 
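A rough standalone illustration of that fallback; the endpoint and `websiteToken` marker are taken from the extractor changes in this series and may change whenever gofile redeploys, so treat this as a sketch rather than the library code:

```python
import re
import time

import requests

_cache = {"token": None, "expires": 0.0}

def website_token():
    """Scrape gofile's current websiteToken, cached for a day."""
    if _cache["token"] and time.time() < _cache["expires"]:
        return _cache["token"]
    js = requests.get("https://gofile.io/dist/js/alljs.js", timeout=30).text
    match = re.search(r'fetchData\.websiteToken\s*=\s*"([^"]*)"', js)
    if not match:
        raise RuntimeError("websiteToken not found")
    _cache["token"] = match.group(1)
    _cache["expires"] = time.time() + 86400   # mirrors @cache(maxage=86400)
    return _cache["token"]
```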
extractor.gofile.recursive diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index b47a02ff0d..057eaed539 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -129,7 +129,7 @@ }, "gofile": { "api-token": null, - "website-token": "12345" + "website-token": null }, "hentaifoundry": { diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 4d18f3d2b3..044dddbd8d 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -6,7 +6,7 @@ from .common import Extractor, Message from .. import text, exception -from ..cache import memcache +from ..cache import cache, memcache import hashlib @@ -75,10 +75,8 @@ def items(self): self.session.cookies.set("accountToken", token, domain=".gofile.io") self.api_token = token - token = self.config("website-token", "abcde") - if not token: - token = self._get_website_token() - self.website_token = token + self.website_token = (self.config("website-token") or + self._get_website_token()) folder = self._get_content(self.content_id, password) yield Message.Directory, folder @@ -111,7 +109,7 @@ def _create_account(self): self.log.debug("Creating temporary account") return self._api_request("createAccount")["token"] - @memcache() + @cache(maxage=86400) def _get_website_token(self): self.log.debug("Fetching website token") page = self.request(self.root + "/dist/js/alljs.js").text From e6f55d155588c026fb0e34140d8bfa0f5f8dca51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 20 May 2023 16:21:11 +0200 Subject: [PATCH 095/252] [imagechest] add API support and 'access-token' option (#4065) --- docs/configuration.rst | 15 +++++++ docs/gallery-dl.conf | 3 ++ gallery_dl/extractor/imagechest.py | 72 ++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 32b75d7431..5088544885 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1741,6 +1741,21 @@ Description but is most likely going to fail with ``403 Forbidden`` errors. +extractor.imagechest.access-token +--------------------------------- +Type + ``string`` +Description + Your personal Image Chest access token. + + These tokens allow using the API instead of having to scrape HTML pages, + providing more detailed metadata. + (``date``, ``description``, etc) + + See https://imgchest.com/docs/api/1.0/general/authorization + for instructions on how to generate such a token. 
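A minimal config sketch for this option — the token string below is a hypothetical placeholder, not a working credential:

```json
{
    "extractor": {
        "imagechest": {
            "access-token": "abcd1234efgh5678"
        }
    }
}
```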
+ + extractor.imgur.client-id ------------------------- Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 057eaed539..6d4c4af0dc 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -146,6 +146,9 @@ "password": null, "sleep-request": 5.0 }, + "imagechest": { + "access-token": null + }, "imgbb": { "username": null, diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index 086b95d893..97a621b076 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -37,14 +37,23 @@ class ImagechestGalleryExtractor(GalleryExtractor): "url": "f5674e8ba79d336193c9f698708d9dcc10e78cc7", "count": 52, }), + ("https://imgchest.com/p/xxxxxxxxxxx", { + "exception": exception.NotFoundError, + }), ) def __init__(self, match): self.gallery_id = match.group(1) url = self.root + "/p/" + self.gallery_id GalleryExtractor.__init__(self, match, url) + self.access_token = self.config("access-token") def metadata(self, page): + if self.access_token: + return self._metadata_api() + return self._metadata_html(page) + + def _metadata_html(self, page): if "Sorry, but the page you requested could not be found." in page: raise exception.NotFoundError("gallery") @@ -54,7 +63,28 @@ def metadata(self, page): page, 'property="og:title" content="', '"').strip()) } + def _metadata_api(self): + api = ImagechestAPI(self, self.access_token) + post = api.post(self.gallery_id) + + post["date"] = text.parse_datetime( + post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + for img in post["images"]: + img["date"] = text.parse_datetime( + img["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + + post["gallery_id"] = self.gallery_id + post.pop("image_count", None) + self._image_list = post.pop("images") + + return post + def images(self, page): + if self.access_token: + return self._images_api() + return self._images_html(page) + + def _images_html(self, page): if " More Files" in page: url = "{}/p/{}/loadAll".format(self.root, self.gallery_id) headers = { @@ -71,3 +101,45 @@ def images(self, page): (url, None) for url in text.extract_iter(page, 'data-url="', '"') ] + + def _images_api(self): + return [ + (img["link"], img) + for img in self._image_list + ] + + +class ImagechestAPI(): + """Interface for the Image Chest API + + https://imgchest.com/docs/api/1.0/general/overview + """ + root = "https://api.imgchest.com" + + def __init__(self, extractor, access_token): + self.extractor = extractor + self.headers = {"Authorization": "Bearer " + access_token} + + def post(self, post_id): + endpoint = "/v1/post/" + post_id + return self._call(endpoint) + + def _call(self, endpoint): + url = self.root + endpoint + + while True: + response = self.extractor.request( + url, headers=self.headers, fatal=None, allow_redirects=False) + + if response.status_code < 300: + return response.json()["data"] + + elif response.status_code < 400: + raise exception.AuthenticationError("Invalid API access token") + + elif response.status_code == 429: + self.extractor.wait(seconds=600) + + else: + self.extractor.log.debug(response.text) + raise exception.StopExtraction("API request failed") From 56b8b8cd360f5490bf071d3e4331618ff669c4e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 21 May 2023 14:26:30 +0200 Subject: [PATCH 096/252] [pixiv] support short novel URLs https://www.pixiv.net/n/ --- gallery_dl/extractor/pixiv.py | 73 ++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/gallery_dl/extractor/pixiv.py 
b/gallery_dl/extractor/pixiv.py index 2b5a62a6ad..96d2fcf690 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -756,42 +756,45 @@ class PixivNovelExtractor(PixivExtractor): subcategory = "novel" request_interval = 1.0 pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/novel/show\.php\?id=(\d+)") - test = ("https://www.pixiv.net/novel/show.php?id=19612040", { - "count": 1, - "content": "c6f22167f9df7aeaf63b51933b4c8ef6fc5e6a1e", - "keyword": { - "caption": r"re:「無能な名無し」と呼ばれ虐げられて育った鈴\(すず\)は、", - "comment_access_control": 0, - "create_date": "2023-04-02T15:18:58+09:00", - "date": "dt:2023-04-02 06:18:58", - "id": 19612040, - "is_bookmarked": False, - "is_muted": False, - "is_mypixiv_only": False, - "is_original": True, - "is_x_restricted": False, - "novel_ai_type": 1, - "page_count": 1, - "rating": "General", - "restrict": 0, - "series": { - "id": 10278364, - "title": "龍の贄嫁〜虐げられた少女は運命の番として愛される〜" - }, - "tags": ["和風ファンタジー", "溺愛", "神様", "ヤンデレ", "執着", - "異能", "ざまぁ", "学園", "神嫁"], - "text_length": 5977, - "title": "異母妹から「無能な名無し」と虐げられていた私、" - "どうやら異母妹に霊力を搾取されていたようです(1)", - "user": { - "account": "yukinaga_chifuyu", - "id": 77055466, + r"/n(?:ovel/show\.php\?id=|/)(\d+)") + test = ( + ("https://www.pixiv.net/novel/show.php?id=19612040", { + "count": 1, + "content": "c6f22167f9df7aeaf63b51933b4c8ef6fc5e6a1e", + "keyword": { + "caption": r"re:「無能な名無し」と呼ばれ虐げられて育った鈴\(すず\)は、", + "comment_access_control": 0, + "create_date": "2023-04-02T15:18:58+09:00", + "date": "dt:2023-04-02 06:18:58", + "id": 19612040, + "is_bookmarked": False, + "is_muted": False, + "is_mypixiv_only": False, + "is_original": True, + "is_x_restricted": False, + "novel_ai_type": 1, + "page_count": 1, + "rating": "General", + "restrict": 0, + "series": { + "id": 10278364, + "title": "龍の贄嫁〜虐げられた少女は運命の番として愛される〜" + }, + "tags": ["和風ファンタジー", "溺愛", "神様", "ヤンデレ", "執着", + "異能", "ざまぁ", "学園", "神嫁"], + "text_length": 5977, + "title": "異母妹から「無能な名無し」と虐げられていた私、" + "どうやら異母妹に霊力を搾取されていたようです(1)", + "user": { + "account": "yukinaga_chifuyu", + "id": 77055466, + }, + "visible": True, + "x_restrict": 0, }, - "visible": True, - "x_restrict": 0, - }, - }) + }), + ("https://www.pixiv.net/n/19612040"), + ) def __init__(self, match): PixivExtractor.__init__(self, match) From 69865dcc0567807fc0921337a9a0879610e103a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 22 May 2023 18:30:45 +0200 Subject: [PATCH 097/252] [formatter] implement slicing strings as bytes (#4087) prefixing a slice '[10:30]' with a lowercase b '[b10:30]' encodes the string to bytes in filesystem encoding before applying the slice --- docs/formatting.md | 23 +++++++++++++++-------- gallery_dl/formatter.py | 30 ++++++++++++++++++++++++++---- test/test_formatter.py | 29 +++++++++++++++++++++++++---- 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/docs/formatting.md b/docs/formatting.md index cc2703d2f3..7c571fdde2 100644 --- a/docs/formatting.md +++ b/docs/formatting.md @@ -11,14 +11,15 @@ Field names select the metadata value to use in a replacement field. While simple names are usually enough, more complex forms like accessing values by attribute, element index, or slicing are also supported. 
-| | Example | Result | -| -------------------- | ----------------- | ---------------------- | -| Name | `{title}` | `Hello World` | -| Element Index | `{title[6]}` | `W` | -| Slicing | `{title[3:8]}` | `lo Wo` | -| Alternatives | `{empty\|title}` | `Hello World` | -| Element Access | `{user[name]}` | `John Doe` | -| Attribute Access | `{extractor.url}` | `https://example.org/` | +| | Example | Result | +| -------------------- | ------------------- | ---------------------- | +| Name | `{title}` | `Hello World` | +| Element Index | `{title[6]}` | `W` | +| Slicing | `{title[3:8]}` | `lo Wo` | +| Slicing (Bytes) | `{title_ja[b3:18]}` | `ロー・ワー` | +| Alternatives | `{empty\|title}` | `Hello World` | +| Element Access | `{user[name]}` | `John Doe` | +| Attribute Access | `{extractor.url}` | `https://example.org/` | All of these methods can be combined as needed. For example `{title[24]|empty|extractor.url[15:-1]}` would result in `.org`. @@ -150,6 +151,12 @@ Format specifiers can be used for advanced formatting by using the options provi {foo:[1:-1]} oo Ba + + [b<start>:<stop>] + Same as above, but applies to the bytes() representation of a string in filesystem encoding + {foo_ja:[b3:-1]} + ー・バ + L<maxlen>/<repl>/ Replaces the entire output with <repl> if its length exceeds <maxlen> diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index fc36fa2c1f..2ff48c321e 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -9,6 +9,7 @@ """String formatters""" import os +import sys import time import string import _string @@ -255,7 +256,11 @@ def parse_field_name(field_name): func = operator.itemgetter try: if ":" in key: - key = _slice(key) + if key[0] == "b": + func = _bytesgetter + key = _slice(key[1:]) + else: + key = _slice(key) else: key = key.strip("\"'") except TypeError: @@ -276,6 +281,14 @@ def _slice(indices): ) +def _bytesgetter(slice, encoding=sys.getfilesystemencoding()): + + def apply_slice_bytes(obj): + return obj.encode(encoding)[slice].decode(encoding, "ignore") + + return apply_slice_bytes + + def _build_format_func(format_spec, default): if format_spec: return _FORMAT_SPECIFIERS.get( @@ -295,11 +308,20 @@ def optional(obj): def _parse_slice(format_spec, default): indices, _, format_spec = format_spec.partition("]") - slice = _slice(indices[1:]) fmt = _build_format_func(format_spec, default) - def apply_slice(obj): - return fmt(obj[slice]) + if indices[1] == "b": + slice_bytes = _bytesgetter(_slice(indices[2:])) + + def apply_slice(obj): + return fmt(slice_bytes(obj)) + + else: + slice = _slice(indices[1:]) + + def apply_slice(obj): + return fmt(obj[slice]) + return apply_slice diff --git a/test/test_formatter.py b/test/test_formatter.py index 225896688d..1bda9d9c67 100644 --- a/test/test_formatter.py +++ b/test/test_formatter.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -23,6 +23,7 @@ class TestFormatter(unittest.TestCase): kwdict = { "a": "hElLo wOrLd", "b": "äöü", + "j": "げんそうきょう", "d": {"a": "foo", "b": 0, "c": None}, "l": ["a", "b", "c"], "n": None, @@ -133,7 +134,7 @@ def test_dict_access(self): self._run_test("{d['a']}", "foo") self._run_test('{d["a"]}', "foo") - def test_slicing(self): + def test_slice_str(self): v = self.kwdict["a"] self._run_test("{a[1:10]}" , v[1:10]) self._run_test("{a[-10:-1]}", v[-10:-1]) @@ 
-165,6 +166,26 @@ def test_slicing(self): self._run_test("{a:[:50:2]}", v[:50:2]) self._run_test("{a:[::]}" , v) + def test_slice_bytes(self): + v = self.kwdict["j"] + self._run_test("{j[b1:10]}" , v[1:3]) + self._run_test("{j[b-10:-1]}", v[-3:-1]) + self._run_test("{j[b5:]}" , v[2:]) + self._run_test("{j[b50:]}" , v[50:]) + self._run_test("{j[b:5]}" , v[:1]) + self._run_test("{j[b:50]}" , v[:50]) + self._run_test("{j[b:]}" , v) + self._run_test("{j[b::]}" , v) + + self._run_test("{j:[b1:10]}" , v[1:3]) + self._run_test("{j:[b-10:-1]}", v[-3:-1]) + self._run_test("{j:[b5:]}" , v[2:]) + self._run_test("{j:[b50:]}" , v[50:]) + self._run_test("{j:[b:5]}" , v[:1]) + self._run_test("{j:[b:50]}" , v[:50]) + self._run_test("{j:[b:]}" , v) + self._run_test("{j:[b::]}" , v) + def test_maxlen(self): v = self.kwdict["a"] self._run_test("{a:L5/foo/}" , "foo") @@ -413,10 +434,10 @@ def noarg(): fmt4 = formatter.parse("\fM " + path + ":lengths") self.assertEqual(fmt1.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt2.format_map(self.kwdict), "89") + self.assertEqual(fmt2.format_map(self.kwdict), "96") self.assertEqual(fmt3.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt4.format_map(self.kwdict), "89") + self.assertEqual(fmt4.format_map(self.kwdict), "96") with self.assertRaises(TypeError): self.assertEqual(fmt0.format_map(self.kwdict), "") From 4fc9675d485b2beabb9d9579a34104f9410b2430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 22 May 2023 18:49:06 +0200 Subject: [PATCH 098/252] [fanbox] skip 404ed or otherwise invalid posts (#4088) --- gallery_dl/extractor/fanbox.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 57c433389a..4ca0852586 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -52,8 +52,11 @@ def _pagination(self, url): url = text.ensure_http_scheme(url) body = self.request(url, headers=headers).json()["body"] for item in body["items"]: - yield self._get_post_data(item["id"]) - + try: + yield self._get_post_data(item["id"]) + except Exception as exc: + self.log.warning("Skipping post %s (%s: %s)", + item["id"], exc.__class__.__name__, exc) url = body["nextUrl"] def _get_post_data(self, post_id): From 9810ab35afc8777b3a2c522d60d5d49774659edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 22 May 2023 22:24:23 +0200 Subject: [PATCH 099/252] [docs] update formatting.md - add element access with quotes - fix \f formatting - remove note about typing \f in a shell (9e6c9813) --- docs/formatting.md | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/docs/formatting.md b/docs/formatting.md index 7c571fdde2..86abc3ef9a 100644 --- a/docs/formatting.md +++ b/docs/formatting.md @@ -18,8 +18,9 @@ While simple names are usually enough, more complex forms like accessing values | Slicing | `{title[3:8]}` | `lo Wo` | | Slicing (Bytes) | `{title_ja[b3:18]}` | `ロー・ワー` | | Alternatives | `{empty\|title}` | `Hello World` | -| Element Access | `{user[name]}` | `John Doe` | | Attribute Access | `{extractor.url}` | `https://example.org/` | +| Element Access | `{user[name]}` | `John Doe` | +| | `{user['name']}` | `John Doe` | All of these methods can be combined as needed. For example `{title[24]|empty|extractor.url[15:-1]}` would result in `.org`. 
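As a sketch of how the byte-oriented slicing above could be used in practice — assuming the extractor in question provides a metadata field named ``title_ja`` (the field name is illustrative) — a filename format like the following would keep at most 48 bytes of the title in filesystem encoding:

```json
{
    "extractor": {
        "filename": "{title_ja[b:48]}.{extension}"
    }
}
```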
@@ -200,7 +201,9 @@ Format specifiers can be used for advanced formatting by using the options provi -All special format specifiers (`?`, `L`, `J`, `R`, `D`, `O`) can be chained and combined with one another, but must always come before any standard format specifiers: +All special format specifiers (`?`, `L`, `J`, `R`, `D`, `O`, etc) +can be chained and combined with one another, +but must always appear before any standard format specifiers: For example `{foo:?//RF/B/Ro/e/> 10}` -> `   Bee Bar` - `?//` - Tests if `foo` has a value @@ -251,7 +254,7 @@ Replacement field names that are available in all format strings. ## Special Type Format Strings -Starting a format string with '\f ' allows to set a different format string type than the default. Available ones are: +Starting a format string with `\f ` allows to set a different format string type than the default. Available ones are: @@ -292,13 +295,3 @@ Starting a format string with '\f ' allows to set a different format strin
    - -> **Note:** -> -> `\f` is the [Form Feed](https://en.wikipedia.org/w/index.php?title=Page_break&oldid=1027475805#Form_feed) -> character. (ASCII code 12 or 0xc) -> -> Writing it as `\f` is native to JSON, but will *not* get interpreted -> as such by most shells. To use this character there: -> * hold `Ctrl`, then press `v` followed by `l`, resulting in `^L` or -> * use `echo` or `printf` (e.g. `gallery-dl -f "$(echo -ne \\fM) my_module:generate_text"`) From 856f6c10cd14229593eb1df1d2a9d18d7b04c760 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 22 May 2023 22:29:30 +0200 Subject: [PATCH 100/252] allow for GalleryExtractors to skip loading gallery_url --- gallery_dl/extractor/common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 09737ef996..50d1026c70 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -555,7 +555,13 @@ def __init__(self, match, url=None): def items(self): self.login() - page = self.request(self.gallery_url, notfound=self.subcategory).text + + if self.gallery_url: + page = self.request( + self.gallery_url, notfound=self.subcategory).text + else: + page = None + data = self.metadata(page) imgs = self.images(page) From d1f2ef3b7b348a753918d738e76a0bea7e1cb449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 22 May 2023 22:42:16 +0200 Subject: [PATCH 101/252] [imagechest] update - don't load HTML page when using API - restructure some code - add more methods to ImagechestAPI --- gallery_dl/extractor/imagechest.py | 57 ++++++++++++++++-------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index 97a621b076..9229617bed 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -46,14 +46,14 @@ def __init__(self, match): self.gallery_id = match.group(1) url = self.root + "/p/" + self.gallery_id GalleryExtractor.__init__(self, match, url) - self.access_token = self.config("access-token") - def metadata(self, page): + self.access_token = self.config("access-token") if self.access_token: - return self._metadata_api() - return self._metadata_html(page) + self.gallery_url = None + self.metadata = self._metadata_api + self.images = self._images_api - def _metadata_html(self, page): + def metadata(self, page): if "Sorry, but the page you requested could not be found." 
in page: raise exception.NotFoundError("gallery") @@ -63,28 +63,7 @@ def _metadata_html(self, page): page, 'property="og:title" content="', '"').strip()) } - def _metadata_api(self): - api = ImagechestAPI(self, self.access_token) - post = api.post(self.gallery_id) - - post["date"] = text.parse_datetime( - post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") - for img in post["images"]: - img["date"] = text.parse_datetime( - img["created"], "%Y-%m-%dT%H:%M:%S.%fZ") - - post["gallery_id"] = self.gallery_id - post.pop("image_count", None) - self._image_list = post.pop("images") - - return post - def images(self, page): - if self.access_token: - return self._images_api() - return self._images_html(page) - - def _images_html(self, page): if " More Files" in page: url = "{}/p/{}/loadAll".format(self.root, self.gallery_id) headers = { @@ -102,7 +81,23 @@ def _images_html(self, page): for url in text.extract_iter(page, 'data-url="', '"') ] - def _images_api(self): + def _metadata_api(self, page): + api = ImagechestAPI(self, self.access_token) + post = api.post(self.gallery_id) + + post["date"] = text.parse_datetime( + post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + for img in post["images"]: + img["date"] = text.parse_datetime( + img["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + + post["gallery_id"] = self.gallery_id + post.pop("image_count", None) + self._image_list = post.pop("images") + + return post + + def _images_api(self, page): return [ (img["link"], img) for img in self._image_list @@ -120,10 +115,18 @@ def __init__(self, extractor, access_token): self.extractor = extractor self.headers = {"Authorization": "Bearer " + access_token} + def file(self, file_id): + endpoint = "/v1/file/" + file_id + return self._call(endpoint) + def post(self, post_id): endpoint = "/v1/post/" + post_id return self._call(endpoint) + def user(self, username): + endpoint = "/v1/user/" + username + return self._call(endpoint) + def _call(self, endpoint): url = self.root + endpoint From 3fca455b82ebcb4514ed16dd80a905fd9ae49225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 23 May 2023 12:14:06 +0200 Subject: [PATCH 102/252] [pixiv] add 'embeds' option (#1241) --- docs/configuration.rst | 10 ++++++ docs/gallery-dl.conf | 1 + gallery_dl/extractor/pixiv.py | 60 +++++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 5088544885..154f6b3e2b 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2404,6 +2404,16 @@ Description `gppt `__. +extractor.pixiv.embeds +---------------------- +Type + ``bool`` +Default + ``false`` +Description + Download images embedded in novels. 
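A minimal sketch enabling it (the default shipped in ``docs/gallery-dl.conf`` is ``false``):

```json
{
    "extractor": {
        "pixiv": {
            "embeds": true
        }
    }
}
```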
+ + extractor.pixiv.metadata ------------------------ Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 6d4c4af0dc..0b61742a09 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -245,6 +245,7 @@ { "refresh-token": null, "include": "artworks", + "embeds": false, "metadata": false, "metadata-bookmark": false, "tags": "japanese", diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 96d2fcf690..cdaf595fa7 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -760,7 +760,7 @@ class PixivNovelExtractor(PixivExtractor): test = ( ("https://www.pixiv.net/novel/show.php?id=19612040", { "count": 1, - "content": "c6f22167f9df7aeaf63b51933b4c8ef6fc5e6a1e", + "content": "8c818474153cbd2f221ee08766e1d634c821d8b4", "keyword": { "caption": r"re:「無能な名無し」と呼ばれ虐げられて育った鈴\(すず\)は、", "comment_access_control": 0, @@ -778,11 +778,12 @@ class PixivNovelExtractor(PixivExtractor): "restrict": 0, "series": { "id": 10278364, - "title": "龍の贄嫁〜虐げられた少女は運命の番として愛される〜" + "title": "龍の贄嫁〜無能な名無しと虐げられていましたが、" + "どうやら異母妹に霊力を搾取されていたようです〜", }, "tags": ["和風ファンタジー", "溺愛", "神様", "ヤンデレ", "執着", "異能", "ざまぁ", "学園", "神嫁"], - "text_length": 5977, + "text_length": 5974, "title": "異母妹から「無能な名無し」と虐げられていた私、" "どうやら異母妹に霊力を搾取されていたようです(1)", "user": { @@ -793,6 +794,11 @@ class PixivNovelExtractor(PixivExtractor): "x_restrict": 0, }, }), + # embeds + ("https://www.pixiv.net/novel/show.php?id=16422450", { + "options": (("embeds", True),), + "count": 3, + }), ("https://www.pixiv.net/n/19612040"), ) @@ -816,6 +822,17 @@ def transform_tags(work): ratings = {0: "General", 1: "R-18", 2: "R-18G"} meta_user = self.config("metadata") meta_bookmark = self.config("metadata-bookmark") + embeds = self.config("embeds") + + if embeds: + headers = { + "User-Agent" : "Mozilla/5.0", + "App-OS" : None, + "App-OS-Version": None, + "App-Version" : None, + "Referer" : self.root + "/", + "Authorization" : None, + } novels = self.novels() if self.max_posts: @@ -840,6 +857,43 @@ def transform_tags(work): content = self.api.novel_text(novel["id"])["novel_text"] yield Message.Url, "text:" + content, novel + if embeds: + desktop = False + illusts = {} + + for marker in text.extract_iter(content, "[", "]"): + if marker.startswith("[jumpuri:"): + desktop = True + elif marker.startswith("pixivimage:"): + illusts[marker[11:].partition("-")[0]] = None + + if desktop: + novel_id = str(novel["id"]) + url = "{}/novel/show.php?id={}".format( + self.root, novel_id) + data = util.json_loads(text.extr( + self.request(url, headers=headers).text, + "id=\"meta-preload-data\" content='", "'")) + + for image in (data["novel"][novel_id] + ["textEmbeddedImages"]).values(): + url = image.pop("urls")["original"] + novel.update(image) + novel["date_url"] = self._date_from_url(url) + novel["num"] += 1 + novel["suffix"] = "_p{:02}".format(novel["num"]) + text.nameext_from_url(url, novel) + yield Message.Url, url, novel + + if illusts: + novel["_extractor"] = PixivWorkExtractor + novel["date_url"] = None + for illust_id in illusts: + novel["num"] += 1 + novel["suffix"] = "_p{:02}".format(novel["num"]) + url = "{}/artworks/{}".format(self.root, illust_id) + yield Message.Queue, url, novel + def novels(self): return (self.api.novel_detail(self.novel_id),) From c76f0f3a1bbc63b726396e5a92c0f0638565b1ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 23 May 2023 22:15:20 +0200 Subject: [PATCH 103/252] [misskey] update - rename to 'MisskeyFavoriteExtractor' - add 'access-token' option to docs - add 
test URLs for other instances - simplify 'pattern' --- docs/configuration.rst | 12 ++++++++++-- docs/gallery-dl.conf | 1 + docs/supportedsites.md | 6 +++--- gallery_dl/extractor/misskey.py | 12 +++++++----- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 6d5ea7c231..7a9c808283 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2059,8 +2059,16 @@ Description Also emit metadata for text-only posts without media content. +extractor.[misskey].access-token +-------------------------------- +Type + ``string`` +Description + Your access token, necessary to fetch favorited notes. + + extractor.[misskey].renotes ----------------------------- +--------------------------- Type ``bool`` Default @@ -2070,7 +2078,7 @@ Description extractor.[misskey].replies ----------------------------- +--------------------------- Type ``bool`` Default diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 92451fda16..bc1985a28c 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -190,6 +190,7 @@ "password": null }, "misskey": { + "access-token": null, "renotes": false, "replies": true }, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index f98508ca67..938e91dac4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1132,19 +1132,19 @@ Consider all sites to be NSFW unless otherwise known. Misskey.io https://misskey.io/ - Images from Notes, User Profiles + Favorites, Images from Notes, User Profiles Lesbian.energy https://lesbian.energy/ - Images from Notes, User Profiles + Favorites, Images from Notes, User Profiles Sushi.ski https://sushi.ski/ - Images from Notes, User Profiles + Favorites, Images from Notes, User Profiles diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py index 26b7f19483..37efac070f 100644 --- a/gallery_dl/extractor/misskey.py +++ b/gallery_dl/extractor/misskey.py @@ -70,7 +70,7 @@ def notes(self): }, "lesbian.energy": { "root": "https://lesbian.energy", - "pattern": r"lesbian\.energy" + "pattern": r"lesbian\.energy", }, "sushi.ski": { "root": "https://sushi.ski", @@ -154,13 +154,15 @@ def notes(self): return (self.api.notes_show(self.item),) -class MisskeyMyFavoritesExtractor(MisskeyExtractor): - """Extractor for images from favorites""" - subcategory = "favorites" - pattern = BASE_PATTERN + r"(/my/favorites|/api/i/favorites)" +class MisskeyFavoriteExtractor(MisskeyExtractor): + """Extractor for favorited notes""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/(?:my|api/i)/favorites" test = ( ("https://misskey.io/my/favorites"), ("https://misskey.io/api/i/favorites"), + ("https://lesbian.energy/my/favorites"), + ("https://sushi.ski/my/favorites"), ) def notes(self): From f3cca50b9e37c8350452411635d63b8cb4332ca8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 24 May 2023 12:32:13 +0200 Subject: [PATCH 104/252] [mangadex] update links to API docs --- docs/configuration.rst | 4 ++-- gallery_dl/extractor/mangadex.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index e430735f3b..b0f793fb57 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2037,8 +2037,8 @@ Example Description Additional query parameters to send when fetching manga chapters. 
- (See `/manga/{id}/feed `_ - and `/user/follows/manga/feed `_) + (See `/manga/{id}/feed `__ + and `/user/follows/manga/feed `__) extractor.mangadex.lang diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 12b8f39235..e111fee353 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -193,7 +193,10 @@ def chapters(self): class MangadexAPI(): - """Interface for the MangaDex API v5""" + """Interface for the MangaDex API v5 + + https://api.mangadex.org/docs/ + """ def __init__(self, extr): self.extractor = extr From 822a77d8466948c07216b537050da28a2597990d Mon Sep 17 00:00:00 2001 From: thatfuckingbird <67429906+thatfuckingbird@users.noreply.github.com> Date: Wed, 24 May 2023 19:06:40 +0200 Subject: [PATCH 105/252] [danbooru] add support for booru.borvar.art instance --- gallery_dl/extractor/danbooru.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 5cfbf5c481..0ce77ad361 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -161,6 +161,10 @@ def _ugoira_frames(self, post): "root": None, "pattern": r"(?:safe.)?aibooru\.online", }, + "booruvar": { + "root": "https://booru.borvar.art", + "pattern": r"booru\.borvar\.art" + }, }) From f8c4c5eef9b0d3f210e8fceed9f2c91cfa893459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 May 2023 13:15:11 +0200 Subject: [PATCH 106/252] [reddit] simplify and add tests --- gallery_dl/extractor/reddit.py | 41 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index e1f1d27ff3..3f09e13e41 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -55,32 +55,26 @@ def items(self): visited.add(submission["id"]) submission["num"] = 0 - url = submission["url"] - if not url: - continue + if "crosspost_parent_list" in submission: + media = submission["crosspost_parent_list"][-1] + else: + media = submission - if url.startswith("https://i.redd.it/"): + url = media["url"] + if url and url.startswith("https://i.redd.it/"): text.nameext_from_url(url, submission) yield Message.Url, url, submission - elif url.startswith("https://www.reddit.com/gallery/"): - gallery_submission = submission - if "crosspost_parent_list" in gallery_submission: - gallery_submission = \ - submission["crosspost_parent_list"][-1] - if "gallery_data" not in gallery_submission: - continue - - gallery = self._extract_gallery(gallery_submission) - - for submission["num"], url in enumerate(gallery, 1): + elif "gallery_data" in media: + for submission["num"], url in enumerate( + self._extract_gallery(media), 1): text.nameext_from_url(url, submission) yield Message.Url, url, submission - elif url.startswith("https://v.redd.it/"): + elif media["is_video"]: if videos: text.nameext_from_url(url, submission) - url = "ytdl:" + self._extract_video(submission) + url = "ytdl:" + self._extract_video(media) yield Message.Url, url, submission elif not submission["is_self"]: @@ -291,14 +285,19 @@ class RedditSubmissionExtractor(RedditExtractor): ("https://www.reddit.com/r/kpopfap/comments/qjj04q/", { "count": 0, }), - ("https://old.reddit.com/r/lavaporn/comments/2a00np/"), - ("https://np.reddit.com/r/lavaporn/comments/2a00np/"), - ("https://m.reddit.com/r/lavaporn/comments/2a00np/"), - ("https://redd.it/2a00np/"), + # user page submission (#2301) 
("https://www.reddit.com/user/TheSpiritTree/comments/srilyf/", { "pattern": r"https://i.redd.it/8fpgv17yqlh81.jpg", "count": 1, }), + # cross-posted video (#887, #3586, #3976) + ("https://www.reddit.com/r/kittengifs/comments/12m0b8d", { + "pattern": r"ytdl:https://v\.redd\.it/cvabpjacrvta1", + }), + ("https://old.reddit.com/r/lavaporn/comments/2a00np/"), + ("https://np.reddit.com/r/lavaporn/comments/2a00np/"), + ("https://m.reddit.com/r/lavaporn/comments/2a00np/"), + ("https://redd.it/2a00np/"), ) def __init__(self, match): From d0184fddcf64d86707b8ae63fe4d667fdd6fe4b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 May 2023 15:18:49 +0200 Subject: [PATCH 107/252] [twitter] optimize '_extract_twitpic()' - use findall instead of finditer - store URLs in a dict to discard duplicates --- gallery_dl/extractor/twitter.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 4838844c24..c47021ef52 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -78,7 +78,7 @@ def items(self): if self.twitpic: self._find_twitpic = re.compile( - r"https?://(twitpic\.com/\w+)").finditer + r"https?(://twitpic\.com/(?!photos/)\w+)").findall for tweet in self.tweets(): @@ -236,27 +236,24 @@ def _extract_card(self, tweet, files): files.append({"url": url}) def _extract_twitpic(self, tweet, files): - # collect urls - urls = [] + urls = {} + + # collect URLs from entities for url in tweet["entities"].get("urls") or (): url = url["expanded_url"] if "//twitpic.com/" not in url or "/photos/" in url: continue if url.startswith("http:"): url = "https" + url[4:] - urls.append(url) - tget = tweet.get - for match in self._find_twitpic( - tget("full_text") or tget("text") or ""): - urls.append(text.ensure_http_scheme(match.group(1))) + urls[url] = None - # extract actual urls - seen = set() + # collect URLs from text + for url in self._find_twitpic( + tweet.get("full_text") or tweet.get("text") or ""): + urls["https" + url] = None + + # extract actual URLs for url in urls: - if url in seen: - self.log.debug("Skipping %s (previously seen)", url) - continue - seen.add(url) response = self.request(url, fatal=False) if response.status_code >= 400: continue From ad760429b1d160ca453fc14450fdabc4b8568641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 May 2023 16:25:53 +0200 Subject: [PATCH 108/252] [8muses] update --- gallery_dl/extractor/8muses.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py index 56c880e34e..11c2201a50 100644 --- a/gallery_dl/extractor/8muses.py +++ b/gallery_dl/extractor/8muses.py @@ -36,8 +36,10 @@ class _8musesAlbumExtractor(Extractor): "id" : 10467, "title" : "Liar", "path" : "Fakku Comics/mogg/Liar", + "parts" : ["Fakku Comics", "mogg", "Liar"], "private": False, - "url" : str, + "url" : "https://comics.8muses.com/comics" + "/album/Fakku-Comics/mogg/Liar", "parent" : 10464, "views" : int, "likes" : int, @@ -119,11 +121,10 @@ def _make_album(self, album): return { "id" : album["id"], "path" : album["path"], - "parts" : album["permalink"], + "parts" : album["path"].split("/"), "title" : album["name"], "private": album["isPrivate"], - "permalink" : album["permalink"], - "url" : self.root + "/" + album["permalink"], + "url" : self.root + "/comics/album/" + album["permalink"], "parent" : 
text.parse_int(album["parentId"]), "views" : text.parse_int(album["numberViews"]), "likes" : text.parse_int(album["numberLikes"]), From 3d29c4214283f57faa569113ce874bb32e6c8cf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 May 2023 17:04:20 +0200 Subject: [PATCH 109/252] [mangaread] fix 'tags' extraction --- gallery_dl/extractor/mangaread.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py index 4af90e09f1..49d4d7d6ff 100644 --- a/gallery_dl/extractor/mangaread.py +++ b/gallery_dl/extractor/mangaread.py @@ -87,7 +87,7 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor): ) def metadata(self, page): - data = {"tags": list(text.extract_iter(page, 'class="">', "<"))} + data = {"tags": list(text.extract_iter(page, "class>", "<"))} info = text.extr(page, '

    ', "

    ") if not info: raise exception.NotFoundError("chapter") @@ -148,11 +148,13 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor): } }), ("https://www.mangaread.org/manga/doesnotexist", { - "exception": exception.HttpError, + "exception": exception.NotFoundError, }), ) def chapters(self, page): + if 'class="error404' in page: + raise exception.NotFoundError("manga") data = self.metadata(page) result = [] for chapter in text.extract_iter( From 2b1f875ef4341c782ef8ca27dcc67a1b9f205f4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 May 2023 18:58:51 +0200 Subject: [PATCH 110/252] [jpgchurch] update to 'jpgfish' --- docs/supportedsites.md | 4 +- gallery_dl/extractor/__init__.py | 2 +- .../extractor/{jpgchurch.py => jpgfish.py} | 44 +++++++++---------- scripts/supportedsites.py | 2 +- 4 files changed, 26 insertions(+), 26 deletions(-) rename gallery_dl/extractor/{jpgchurch.py => jpgfish.py} (80%) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 46d6d92ff2..5835497ec6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -392,8 +392,8 @@ Consider all sites to be NSFW unless otherwise known. - Jpgchurch - https://jpg.church/ + JPG Fish + https://jpg.fishing/ Albums, individual Images, User Profiles diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 0751f7399d..38bef09172 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -66,7 +66,7 @@ "instagram", "issuu", "itaku", - "jpgchurch", + "jpgfish", "kabeuchi", "keenspot", "kemonoparty", diff --git a/gallery_dl/extractor/jpgchurch.py b/gallery_dl/extractor/jpgfish.py similarity index 80% rename from gallery_dl/extractor/jpgchurch.py rename to gallery_dl/extractor/jpgfish.py index 34910d108c..e611bf1c38 100644 --- a/gallery_dl/extractor/jpgchurch.py +++ b/gallery_dl/extractor/jpgfish.py @@ -1,21 +1,21 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann -# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. +"""Extractors for https://jpg.fishing/""" + from .common import Extractor, Message from .. 
import text -BASE_PATTERN = r"(?:https?://)?jpg\.church" +BASE_PATTERN = r"(?:https?://)?jpg\.(?:fishing|church)" -class JpgchurchExtractor(Extractor): - """Base class for Jpgchurch extractors""" - category = "jpgchurch" - root = "https://jpg.church" +class JpgfishExtractor(Extractor): + """Base class for jpgfish extractors""" + category = "jpgfish" + root = "https://jpg.fishing" directory_fmt = ("{category}", "{user}", "{album}",) archive_fmt = "{user}_{filename}" @@ -60,12 +60,12 @@ def _get_albums(self, url): yield image -class JpgchurchImageExtractor(JpgchurchExtractor): - """Extractor for Jpgchurch Images""" +class JpgfishImageExtractor(JpgfishExtractor): + """Extractor for jpgfish Images""" subcategory = "image" pattern = BASE_PATTERN + r"/img/([^/?#]+)" test = ( - ("https://jpg.church/img/funnymeme.LecXGS", { + ("https://jpg.fishing/img/funnymeme.LecXGS", { "pattern": r"^https://[^/]+/.*\.(jpg|png)", }), ("https://jpg.church/img/hannahowo-00457.auCruA", { @@ -75,7 +75,7 @@ class JpgchurchImageExtractor(JpgchurchExtractor): ) def __init__(self, match): - JpgchurchExtractor.__init__(self, match) + JpgfishExtractor.__init__(self, match) self.image = match.group(1) def items(self): @@ -87,12 +87,12 @@ def items(self): yield Message.Url, image["url"], image -class JpgchurchAlbumExtractor(JpgchurchExtractor): - """Extractor for Jpgchurch Albums""" +class JpgfishAlbumExtractor(JpgfishExtractor): + """Extractor for jpgfish Albums""" subcategory = "album" pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?" test = ( - ("https://jpg.church/album/CDilP/?sort=date_desc&page=1", { + ("https://jpg.fishing/album/CDilP/?sort=date_desc&page=1", { "count": 2, }), ("https://jpg.church/a/gunggingnsk.N9OOI", { @@ -107,12 +107,12 @@ class JpgchurchAlbumExtractor(JpgchurchExtractor): ) def __init__(self, match): - JpgchurchExtractor.__init__(self, match) + JpgfishExtractor.__init__(self, match) self.album, self.is_sub = match.groups() def items(self): url = "{}/a/{}".format(self.root, self.album) - data = {"_extractor": JpgchurchImageExtractor} + data = {"_extractor": JpgfishImageExtractor} if self.is_sub: url += "/sub" for album in self._get_albums(url): @@ -123,12 +123,12 @@ def items(self): yield Message.Queue, image, data -class JpgchurchUserExtractor(JpgchurchExtractor): - """Extractor for Jpgchurch Users""" +class JpgfishUserExtractor(JpgfishExtractor): + """Extractor for jpgfish Users""" subcategory = "user" pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?" 
test = ( - ("https://jpg.church/exearco", { + ("https://jpg.fishing/exearco", { "count": 3, }), ("https://jpg.church/exearco/albums", { @@ -137,17 +137,17 @@ class JpgchurchUserExtractor(JpgchurchExtractor): ) def __init__(self, match): - JpgchurchExtractor.__init__(self, match) + JpgfishExtractor.__init__(self, match) self.user, self.is_album = match.groups() def items(self): url = "{}/{}".format(self.root, self.user) if self.is_album: url += "/albums" - data = {"_extractor": JpgchurchAlbumExtractor} + data = {"_extractor": JpgfishAlbumExtractor} for album in self._get_albums(url): yield Message.Queue, album, data else: - data = {"_extractor": JpgchurchImageExtractor} + data = {"_extractor": JpgfishImageExtractor} for image in self._get_albums(url): yield Message.Queue, image, data diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 1743efb4ca..a4a6406785 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -58,7 +58,7 @@ "imgth" : "imgth", "imgur" : "imgur", "joyreactor" : "JoyReactor", - "jpgchurch" : "Jpgchurch", + "jpgfish" : "JPG Fish", "kabeuchi" : "かべうち", "kemonoparty" : "Kemono", "kireicake" : "Kirei Cake", From 609c4f3e07eb8ba13ef643acafe6ffae1e03c167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 May 2023 22:58:42 +0200 Subject: [PATCH 111/252] [jpgfish] simplify and improve --- gallery_dl/extractor/jpgfish.py | 106 +++++++++++++++----------------- 1 file changed, 48 insertions(+), 58 deletions(-) diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py index e611bf1c38..cdcf35cb10 100644 --- a/gallery_dl/extractor/jpgfish.py +++ b/gallery_dl/extractor/jpgfish.py @@ -17,72 +17,61 @@ class JpgfishExtractor(Extractor): category = "jpgfish" root = "https://jpg.fishing" directory_fmt = ("{category}", "{user}", "{album}",) - archive_fmt = "{user}_{filename}" - - @staticmethod - def _extract_user(page): - return text.extract(page, 'username: "', '"')[0] - - @staticmethod - def _extract_album(page): - album = text.extract(page, 'Added to ', '')[0] - return album - - def _extract_image(self, url): - page = self.request(url).text - data = { - "url": text.extract( - page, '')[0], - } - text.nameext_from_url(data["url"], data) - data["user"] = self._extract_user(page) - data["album"] = self._extract_album(page) - return data + archive_fmt = "{id}" def _pagination(self, url): - while True: - yield url + while url: page = self.request(url).text - _next = text.extract( - page, '<')[0] - if not _next: - return - url = _next - def _get_albums(self, url): - for url in self._pagination(url): - page = self.request(url).text - page = text.extract_iter( - page, '
    <')[0] class JpgfishImageExtractor(JpgfishExtractor): """Extractor for jpgfish Images""" subcategory = "image" - pattern = BASE_PATTERN + r"/img/([^/?#]+)" + pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))" test = ( ("https://jpg.fishing/img/funnymeme.LecXGS", { - "pattern": r"^https://[^/]+/.*\.(jpg|png)", + "pattern": r"https://simp3\.jpg\.church/images/funnymeme\.jpg", + "content": "098e5e9b17ad634358426e0ffd1c93871474d13c", + "keyword": { + "album": "", + "extension": "jpg", + "filename": "funnymeme", + "id": "LecXGS", + "url": "https://simp3.jpg.church/images/funnymeme.jpg", + "user": "exearco", + }, }), - ("https://jpg.church/img/hannahowo-00457.auCruA", { + ("https://jpg.church/img/auCruA", { "pattern": r"https://simp2\.jpg\.church/hannahowo_00457\.jpg", + "keyword": {"album": "401-500"}, }), ("https://jpg.church/img/hannahowo-00424.au64iA"), ) def __init__(self, match): JpgfishExtractor.__init__(self, match) - self.image = match.group(1) + self.path, self.image_id = match.groups() def items(self): - url = "{}/img/{}".format(self.root, self.image) - image = self._extract_image(url) - if not image["album"]: - self.directory_fmt = ("{category}", "{user}",) + url = "{}/img/{}".format(self.root, self.path) + extr = text.extract_from(self.request(url).text) + + image = { + "id" : self.image_id, + "url" : extr('"), ">", "<")[0] or "", + "user" : extr('username: "', '"'), + } + + text.nameext_from_url(image["url"], image) yield Message.Directory, image yield Message.Url, image["url"], image @@ -108,18 +97,19 @@ class JpgfishAlbumExtractor(JpgfishExtractor): def __init__(self, match): JpgfishExtractor.__init__(self, match) - self.album, self.is_sub = match.groups() + self.album, self.sub_albums = match.groups() def items(self): url = "{}/a/{}".format(self.root, self.album) data = {"_extractor": JpgfishImageExtractor} - if self.is_sub: - url += "/sub" - for album in self._get_albums(url): - for image in self._get_albums(album): - yield Message.Queue, image, data + + if self.sub_albums: + albums = self._pagination(url + "/sub") else: - for image in self._get_albums(url): + albums = (url,) + + for album in albums: + for image in self._pagination(album): yield Message.Queue, image, data @@ -138,16 +128,16 @@ class JpgfishUserExtractor(JpgfishExtractor): def __init__(self, match): JpgfishExtractor.__init__(self, match) - self.user, self.is_album = match.groups() + self.user, self.albums = match.groups() def items(self): url = "{}/{}".format(self.root, self.user) - if self.is_album: + + if self.albums: url += "/albums" data = {"_extractor": JpgfishAlbumExtractor} - for album in self._get_albums(url): - yield Message.Queue, album, data else: data = {"_extractor": JpgfishImageExtractor} - for image in self._get_albums(url): - yield Message.Queue, image, data + + for url in self._pagination(url): + yield Message.Queue, url, data From a5d0b03bdee404db28d1cb1dfd2d1685ac13d508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 May 2023 23:11:49 +0200 Subject: [PATCH 112/252] [ytdl] fix crash due to removed 'no_color' attribute https://github.com/yt-dlp/yt-dlp/commit/8417f26b8a819cd7ffcd4e000ca3e45033e670fb --- gallery_dl/ytdl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index eb09b9b77c..0a0bf8645b 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -399,7 +399,7 @@ def metadataparser_actions(f): "playlist_items": opts.playlist_items, "xattr_set_filesize": opts.xattr_set_filesize, 
"match_filter": match_filter, - "no_color": opts.no_color, + "no_color": getattr(opts, "no_color", None), "ffmpeg_location": opts.ffmpeg_location, "hls_prefer_native": opts.hls_prefer_native, "hls_use_mpegts": opts.hls_use_mpegts, From 9b2326e4e19e9de365357508bc7dc44ba721b4d7 Mon Sep 17 00:00:00 2001 From: chio0hai <94094996+chio0hai@users.noreply.github.com> Date: Fri, 26 May 2023 03:22:23 -0400 Subject: [PATCH 113/252] [lensdump] add lensdump.com extractor --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/lensdump.py | 152 +++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 gallery_dl/extractor/lensdump.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 1f77f94a49..3e47c3ec89 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -81,6 +81,7 @@ "kemonoparty", "khinsider", "komikcast", + "lensdump", "lexica", "lightroom", "lineblog", diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py new file mode 100644 index 0000000000..a88f5a6420 --- /dev/null +++ b/gallery_dl/extractor/lensdump.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- + +"""Extractors for https://lensdump.com/""" + +import json + +from .common import GalleryExtractor, Extractor, Message +from .. import text + + +class LensdumpExtractor(GalleryExtractor): + """Extractor for lensdump.com""" + category = "lensdump" + root = "https://lensdump.com" + + def get_meta_prop(self, page, name): + return text.extr(page, f'property="{name}" content="', '"') + + def nodes(self, page=None): + if page is None: + page = self.request(self.url).text + + # go through all pages starting from the oldest + page_url = text.urljoin(self.root, text.extr( + text.extr(page, ' id="list-most-oldest-link"', '>'), + 'href="', '"')) + while page_url is not None: + if page_url == self.url: + current_page = page + else: + current_page = self.request(page_url).text + + for node in text.extract_iter( + current_page, ' class="list-item ', '>'): + yield node + + # find url of next page + page_url = text.extr( + text.extr(current_page, ' data-pagination="next"', '>'), + 'href="', '"') + if page_url is not None and len(page_url) > 0: + page_url = text.urljoin(self.root, page_url) + else: + page_url = None + + +class LensdumpAlbumExtractor(LensdumpExtractor): + subcategory = "album" + pattern = (r"(?:https?://)?lensdump\.com/" + r"(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))") + test = ( + ("https://lensdump.com/a/1IhJr", { + "url": "7428cc906e7b291c778d446a11c602b81ba72840", + "keyword": { + "extension": "png", + "name": str, + "num": int, + "title": str, + "url": str, + "width": int, + }, + }), + ) + + def __init__(self, match): + GalleryExtractor.__init__(self, match, match.string) + self.gallery_id = match.group(1) or match.group(2) + + def metadata(self, page): + return { + "gallery_id": self.gallery_id, + "title": text.unescape(text.extr( + page, 'property="og:title" content="', '"').strip()) + } + + def images(self, page): + for node in self.nodes(page): + # get urls and filenames of images in current page + json_data = json.loads(text.unquote( + text.extr(node, 'data-object="', '"'))) + image_id = json_data.get('name') + image_url = json_data.get('url') + image_title = json_data.get('title') + if image_title is not None: + image_title = text.unescape(image_title) + yield (image_url, { + 'id': image_id, + 'url': image_url, + 'title': image_title, + 'name': json_data.get('filename'), + 'filename': image_id, + 'extension': 
json_data.get('extension'), + 'height': text.parse_int(json_data.get('height')), + 'width': text.parse_int(json_data.get('width')), + }) + + +class LensdumpAlbumsExtractor(LensdumpExtractor): + """Extractor for album list from lensdump.com""" + pattern = r"(?:https?://)?lensdump\.com/\w+/albums" + + def __init__(self, match): + Extractor.__init__(self, match) + + def items(self): + for node in self.nodes(): + album_url = text.urljoin(self.root, text.extr( + node, 'data-url-short="', '"')) + yield Message.Queue, album_url, { + "_extractor": LensdumpAlbumExtractor} + + +class LensdumpImageExtractor(LensdumpExtractor): + """Extractor for individual images on lensdump.com""" + subcategory = "image" + filename_fmt = "{category}_{id}{title:?_//}.{extension}" + directory_fmt = ("{category}",) + archive_fmt = "{id}" + pattern = r"(?:https?://)?lensdump\.com/i/(\w+)" + test = ( + ("https://lensdump.com/i/tyoAyM", { + "url": "ae9933f5f3bd9497bfc34e3e70a0fbef6c562d38", + "content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46", + "keyword": { + "extension": "webp", + "filename": "tyoAyM", + "height": "400", + "id": "tyoAyM", + "title": "MYOBI clovis bookcaseset", + "url": "https://i2.lensdump.com/i/tyoAyM.webp", + "width": "620", + }, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.key = match.group(1) + + def items(self): + page = self.request(self.url).text + image_url = text.extr(page, 'property="og:image" content="', '"') + data = text.nameext_from_url(image_url) + data.update({ + 'id': self.key, + 'url': image_url, + 'title': self.get_meta_prop(page, "og:title"), + 'height': self.get_meta_prop(page, "image:height"), + 'width': self.get_meta_prop(page, "image:width"), + }) + yield Message.Directory, data + yield Message.Url, image_url, data From 82ba6bfdc0ff9cfd1932520a52751f3a6236dd4f Mon Sep 17 00:00:00 2001 From: chio0hai <94094996+chio0hai@users.noreply.github.com> Date: Fri, 26 May 2023 03:46:12 -0400 Subject: [PATCH 114/252] [lensdump] f-string fix --- gallery_dl/extractor/lensdump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index a88f5a6420..b0545ca346 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -14,7 +14,7 @@ class LensdumpExtractor(GalleryExtractor): root = "https://lensdump.com" def get_meta_prop(self, page, name): - return text.extr(page, f'property="{name}" content="', '"') + return text.extr(page, 'property="{}" content="'.format(name), '"') def nodes(self, page=None): if page is None: From d5300cf381024728d6405815e88383609837f632 Mon Sep 17 00:00:00 2001 From: chio0hai <94094996+chio0hai@users.noreply.github.com> Date: Fri, 26 May 2023 03:51:42 -0400 Subject: [PATCH 115/252] [lensdump] subcategory --- gallery_dl/extractor/lensdump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index b0545ca346..c35c33ef9b 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -97,6 +97,7 @@ def images(self, page): class LensdumpAlbumsExtractor(LensdumpExtractor): """Extractor for album list from lensdump.com""" + subcategory = "albums" pattern = r"(?:https?://)?lensdump\.com/\w+/albums" def __init__(self, match): From 3516fdae744539bed324086379e85ded30a39f85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 26 May 2023 13:35:02 +0200 Subject: [PATCH 116/252] [kemonoparty] fix kemono and coomer logins 
using the same cache (#4098) --- gallery_dl/extractor/kemonoparty.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 33e8370ab7..915fbe6879 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -125,10 +125,12 @@ def items(self): def login(self): username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) + self._update_cookies(self._login_impl( + (username, self.cookiedomain), password)) @cache(maxage=28*24*3600, keyarg=1) def _login_impl(self, username, password): + username = username[0] self.log.info("Logging in as %s", username) url = self.root + "/account/login" From 58f7480d46dec2cc5b3d07789e29bc76a9cfe1f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 26 May 2023 23:39:17 +0200 Subject: [PATCH 117/252] [lensdump] update - update docs/supportedsites.md - add GPL2 header - use BASE_PATTERN - improve LensdumpImageExtractor --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/lensdump.py | 72 ++++++++++++++++++-------------- scripts/supportedsites.py | 3 ++ 3 files changed, 49 insertions(+), 32 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 27bb0bbe7c..995f251974 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -463,6 +463,12 @@ Consider all sites to be NSFW unless otherwise known. Chapters, Manga + + Lensdump + https://lensdump.com/ + Albums, individual Images + + Lexica https://lexica.art/ diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index c35c33ef9b..89906215fc 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -1,21 +1,22 @@ # -*- coding: utf-8 -*- -"""Extractors for https://lensdump.com/""" +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. -import json +"""Extractors for https://lensdump.com/""" from .common import GalleryExtractor, Extractor, Message -from .. import text +from .. 
import text, util + +BASE_PATTERN = r"(?:https?://)?lensdump\.com" -class LensdumpExtractor(GalleryExtractor): - """Extractor for lensdump.com""" +class LensdumpBase(): + """Base class for lensdump extractors""" category = "lensdump" root = "https://lensdump.com" - def get_meta_prop(self, page, name): - return text.extr(page, 'property="{}" content="'.format(name), '"') - def nodes(self, page=None): if page is None: page = self.request(self.url).text @@ -44,10 +45,9 @@ def nodes(self, page=None): page_url = None -class LensdumpAlbumExtractor(LensdumpExtractor): +class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): subcategory = "album" - pattern = (r"(?:https?://)?lensdump\.com/" - r"(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))") + pattern = BASE_PATTERN + r"/(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))" test = ( ("https://lensdump.com/a/1IhJr", { "url": "7428cc906e7b291c778d446a11c602b81ba72840", @@ -76,7 +76,7 @@ def metadata(self, page): def images(self, page): for node in self.nodes(page): # get urls and filenames of images in current page - json_data = json.loads(text.unquote( + json_data = util.json_loads(text.unquote( text.extr(node, 'data-object="', '"'))) image_id = json_data.get('name') image_url = json_data.get('url') @@ -95,13 +95,11 @@ def images(self, page): }) -class LensdumpAlbumsExtractor(LensdumpExtractor): +class LensdumpAlbumsExtractor(LensdumpBase, Extractor): """Extractor for album list from lensdump.com""" subcategory = "albums" - pattern = r"(?:https?://)?lensdump\.com/\w+/albums" - - def __init__(self, match): - Extractor.__init__(self, match) + pattern = BASE_PATTERN + r"/\w+/albums" + test = ("https://lensdump.com/vstar925/albums",) def items(self): for node in self.nodes(): @@ -111,25 +109,27 @@ def items(self): "_extractor": LensdumpAlbumExtractor} -class LensdumpImageExtractor(LensdumpExtractor): +class LensdumpImageExtractor(LensdumpBase, Extractor): """Extractor for individual images on lensdump.com""" subcategory = "image" filename_fmt = "{category}_{id}{title:?_//}.{extension}" directory_fmt = ("{category}",) archive_fmt = "{id}" - pattern = r"(?:https?://)?lensdump\.com/i/(\w+)" + pattern = BASE_PATTERN + r"/i/(\w+)" test = ( ("https://lensdump.com/i/tyoAyM", { + "pattern": r"https://i\d\.lensdump\.com/i/tyoAyM\.webp", "url": "ae9933f5f3bd9497bfc34e3e70a0fbef6c562d38", "content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46", "keyword": { + "date": "dt:2022-08-01 08:24:28", "extension": "webp", "filename": "tyoAyM", - "height": "400", + "height": 400, "id": "tyoAyM", "title": "MYOBI clovis bookcaseset", "url": "https://i2.lensdump.com/i/tyoAyM.webp", - "width": "620", + "width": 620, }, }), ) @@ -139,15 +139,23 @@ def __init__(self, match): self.key = match.group(1) def items(self): - page = self.request(self.url).text - image_url = text.extr(page, 'property="og:image" content="', '"') - data = text.nameext_from_url(image_url) - data.update({ - 'id': self.key, - 'url': image_url, - 'title': self.get_meta_prop(page, "og:title"), - 'height': self.get_meta_prop(page, "image:height"), - 'width': self.get_meta_prop(page, "image:width"), - }) + url = "{}/i/{}".format(self.root, self.key) + extr = text.extract_from(self.request(url).text) + + data = { + "id" : self.key, + "title" : text.unescape(extr( + 'property="og:title" content="', '"')), + "url" : extr( + 'property="og:image" content="', '"'), + "width" : text.parse_int(extr( + 'property="image:width" content="', '"')), + "height": text.parse_int(extr( + 'property="image:height" content="', '"')), + "date" : 
text.parse_datetime(extr( + ' Date: Sat, 27 May 2023 15:51:13 +0200 Subject: [PATCH 118/252] [instagram] add 'metadata' option (#3107) --- docs/configuration.rst | 14 ++++++++++++++ gallery_dl/extractor/instagram.py | 4 ++++ 2 files changed, 18 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index b0f793fb57..024bf481e0 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1831,6 +1831,20 @@ Description It is possible to use ``"all"`` instead of listing all values separately. +extractor.instagram.metadata +---------------------------- +Type + ``bool`` +Default + ``false`` +Description + Provide extended ``user`` metadata even when referring to a user by ID, + e.g. ``instagram.com/id:12345678``. + + Note: This metadata is always available when referring to a user by name, + e.g. ``instagram.com/USERNAME``. + + extractor.instagram.order-files ------------------------------- Type diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 1e1de9442b..677cbdda24 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -814,13 +814,17 @@ def user_by_name(self, screen_name): params = {"username": screen_name} return self._call(endpoint, params=params)["data"]["user"] + @memcache(keyarg=1) def user_by_id(self, user_id): endpoint = "/v1/users/{}/info/".format(user_id) return self._call(endpoint)["user"] def user_id(self, screen_name, check_private=True): if screen_name.startswith("id:"): + if self.extractor.config("metadata"): + self.extractor._user = self.user_by_id(screen_name[3:]) return screen_name[3:] + user = self.user_by_name(screen_name) if user is None: raise exception.AuthorizationError( From 5283db1aaed400f2f588f72808a6a903d3eea6db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 27 May 2023 17:08:25 +0200 Subject: [PATCH 119/252] release version 1.25.5 --- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85c732dd60..405c1174a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog +## 1.25.5 - 2023-05-27 +### Additions +- [8muses] add `parts` metadata field ([#3329](https://github.com/mikf/gallery-dl/issues/3329)) +- [danbooru] add `date` metadata field ([#4047](https://github.com/mikf/gallery-dl/issues/4047)) +- [e621] add `date` metadata field ([#4047](https://github.com/mikf/gallery-dl/issues/4047)) +- [gofile] add basic password support ([#4056](https://github.com/mikf/gallery-dl/issues/4056)) +- [imagechest] implement API support ([#4065](https://github.com/mikf/gallery-dl/issues/4065)) +- [instagram] add `order-files` option ([#3993](https://github.com/mikf/gallery-dl/issues/3993), [#4017](https://github.com/mikf/gallery-dl/issues/4017)) +- [instagram] add `order-posts` option ([#3993](https://github.com/mikf/gallery-dl/issues/3993), [#4017](https://github.com/mikf/gallery-dl/issues/4017)) +- [instagram] add `metadata` option ([#3107](https://github.com/mikf/gallery-dl/issues/3107)) +- [jpgfish] add `jpg.fishing` extractors ([#2657](https://github.com/mikf/gallery-dl/issues/2657), [#2719](https://github.com/mikf/gallery-dl/issues/2719)) +- [lensdump] add `lensdump.com` extractors ([#2078](https://github.com/mikf/gallery-dl/issues/2078), [#4104](https://github.com/mikf/gallery-dl/issues/4104)) +- [mangaread] add `mangaread.org` extractors 
([#2425](https://github.com/mikf/gallery-dl/issues/2425), [#2781](https://github.com/mikf/gallery-dl/issues/2781)) +- [misskey] add `favorite` extractor ([#3950](https://github.com/mikf/gallery-dl/issues/3950)) +- [pixiv] add `novel` support ([#1241](https://github.com/mikf/gallery-dl/issues/1241), [#4044](https://github.com/mikf/gallery-dl/issues/4044)) +- [reddit] support cross-posted media ([#887](https://github.com/mikf/gallery-dl/issues/887), [#3586](https://github.com/mikf/gallery-dl/issues/3586), [#3976](https://github.com/mikf/gallery-dl/issues/3976)) +- [postprocessor:exec] support tilde expansion for `command` +- [formatter] support slicing strings as bytes ([#4087](https://github.com/mikf/gallery-dl/issues/4087)) +### Fixes +- [8muses] fix value of `album[url]` ([#3329](https://github.com/mikf/gallery-dl/issues/3329)) +- [danbooru] refactor pagination logic ([#4002](https://github.com/mikf/gallery-dl/issues/4002)) +- [fanbox] skip invalid posts ([#4088](https://github.com/mikf/gallery-dl/issues/4088)) +- [gofile] automatically fetch `website-token` +- [kemonoparty] fix kemono and coomer logins sharing the same cache ([#4098](https://github.com/mikf/gallery-dl/issues/4098)) +- [newgrounds] add default delay between requests ([#4046](https://github.com/mikf/gallery-dl/issues/4046)) +- [nsfwalbum] detect placeholder images +- [poipiku] extract full `descriptions` ([#4066](https://github.com/mikf/gallery-dl/issues/4066)) +- [tcbscans] update domain to `tcbscans.com` ([#4080](https://github.com/mikf/gallery-dl/issues/4080)) +- [twitter] extract TwitPic URLs in text ([#3792](https://github.com/mikf/gallery-dl/issues/3792), [#3796](https://github.com/mikf/gallery-dl/issues/3796)) +- [weibo] require numeric IDs to have length >= 10 ([#4059](https://github.com/mikf/gallery-dl/issues/4059)) +- [ytdl] fix crash due to removed `no_color` attribute +- [cookies] improve logging behavior ([#4050](https://github.com/mikf/gallery-dl/issues/4050)) + ## 1.25.4 - 2023-05-07 ### Additions - [4chanarchives] add `thread` and `board` extractors ([#4012](https://github.com/mikf/gallery-dl/issues/4012)) diff --git a/README.rst b/README.rst index 36f3ffae72..ba745a85af 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 8d4c98a5f3..3e0290c1fb 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.25.5-dev" +__version__ = "1.25.5" From b286efefccde5a588e6e7664d7faef0e4f4543ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 28 May 2023 16:30:17 +0200 Subject: [PATCH 120/252] [pixiv] add 'novel-bookmark' extractor (#4111) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/pixiv.py | 49 +++++++++++++++++++++++++++++++---- gallery_dl/version.py | 2 +- scripts/supportedsites.py | 1 + 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 995f251974..ece48ced1a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -664,7 +664,7 @@ Consider all sites to be NSFW unless otherwise known. 
Pixiv https://www.pixiv.net/ - Artworks, Avatars, Backgrounds, Favorites, Follows, pixiv.me Links, Novels, Novel Series, pixivision, Rankings, Search Results, Series, Sketch, User Profiles, individual Images + Artworks, Avatars, Backgrounds, Favorites, Follows, pixiv.me Links, Novels, Novel Bookmarks, Novel Series, pixivision, Rankings, Search Results, Series, Sketch, User Profiles, individual Images OAuth diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index cdaf595fa7..cc013e6d98 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -168,11 +168,12 @@ def __init__(self, match): def items(self): base = "{}/users/{}/".format(self.root, self.user_id) return self._dispatch_extractors(( - (PixivAvatarExtractor , base + "avatar"), - (PixivBackgroundExtractor, base + "background"), - (PixivArtworksExtractor , base + "artworks"), - (PixivFavoriteExtractor , base + "bookmarks/artworks"), - (PixivNovelUserExtractor , base + "novels"), + (PixivAvatarExtractor , base + "avatar"), + (PixivBackgroundExtractor , base + "background"), + (PixivArtworksExtractor , base + "artworks"), + (PixivFavoriteExtractor , base + "bookmarks/artworks"), + (PixivNovelBookmarkExtractor, base + "bookmarks/novels"), + (PixivNovelUserExtractor , base + "novels"), ), ("artworks",)) @@ -799,6 +800,7 @@ class PixivNovelExtractor(PixivExtractor): "options": (("embeds", True),), "count": 3, }), + # short URL ("https://www.pixiv.net/n/19612040"), ) @@ -927,6 +929,38 @@ def novels(self): return self.api.novel_series(self.novel_id) +class PixivNovelBookmarkExtractor(PixivNovelExtractor): + """Extractor for bookmarked pixiv novels""" + subcategory = "novel-bookmark" + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/(?:en/)?users/(\d+)/bookmarks/novels" + r"(?:/([^/?#]+))?(?:/?\?([^#]+))?") + test = ( + ("https://www.pixiv.net/en/users/77055466/bookmarks/novels", { + "count": 1, + "content": "7194e8faa876b2b536f185ee271a2b6e46c69089", + }), + ("https://www.pixiv.net/en/users/11/bookmarks/novels/TAG?rest=hide"), + ) + + def __init__(self, match): + PixivNovelExtractor.__init__(self, match) + self.user_id, self.tag, self.query = match.groups() + + def novels(self): + if self.tag: + tag = text.unquote(self.tag) + else: + tag = None + + if text.parse_query(self.query).get("rest") == "hide": + restrict = "private" + else: + restrict = "public" + + return self.api.user_bookmarks_novel(self.user_id, tag, restrict) + + class PixivSketchExtractor(Extractor): """Extractor for user pages on sketch.pixiv.net""" category = "pixiv" @@ -1113,6 +1147,11 @@ def user_bookmarks_illust(self, user_id, tag=None, restrict="public"): params = {"user_id": user_id, "tag": tag, "restrict": restrict} return self._pagination("/v1/user/bookmarks/illust", params) + def user_bookmarks_novel(self, user_id, tag=None, restrict="public"): + """Return novels bookmarked by a user""" + params = {"user_id": user_id, "tag": tag, "restrict": restrict} + return self._pagination("/v1/user/bookmarks/novel", params, "novels") + def user_bookmark_tags_illust(self, user_id, restrict="public"): """Return bookmark tags defined by a user""" params = {"user_id": user_id, "restrict": restrict} diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 3e0290c1fb..5d0a9f0cd2 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.25.5" +__version__ = "1.25.6-dev" diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index d3ea0c0a71..fb36957b6e 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -218,6 +218,7 @@ }, "pixiv": { "me" : "pixiv.me Links", + "novel-bookmark": "Novel Bookmarks", "novel-series": "Novel Series", "novel-user": "", "pixivision": "pixivision", From ffed7efb6f6d417af444a01b609821c292f7017f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 28 May 2023 18:06:47 +0200 Subject: [PATCH 121/252] [pixiv] use BASE_PATTERN --- gallery_dl/extractor/pixiv.py | 41 +++++++++++++++-------------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index cc013e6d98..6781a331bf 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -15,6 +15,9 @@ import itertools import hashlib +BASE_PATTERN = r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" +USER_PATTERN = BASE_PATTERN + r"/(?:en/)?users/(\d+)" + class PixivExtractor(Extractor): """Base class for pixiv extractors""" @@ -150,7 +153,7 @@ def metadata(self): class PixivUserExtractor(PixivExtractor): """Extractor for a pixiv user profile""" subcategory = "user" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:" + pattern = (BASE_PATTERN + r"/(?:" r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id=" r")(\d+)(?:$|[?#])") test = ( @@ -180,7 +183,7 @@ def items(self): class PixivArtworksExtractor(PixivExtractor): """Extractor for artworks of a pixiv user""" subcategory = "artworks" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:" + pattern = (BASE_PATTERN + r"/(?:" r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)" r"(?:/([^/?#]+))?/?(?:$|[?#])" r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)") @@ -241,8 +244,7 @@ class PixivAvatarExtractor(PixivExtractor): subcategory = "avatar" filename_fmt = "avatar{date:?_//%Y-%m-%d}.{extension}" archive_fmt = "avatar_{user[id]}_{date}" - pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" - r"/(?:en/)?users/(\d+)/avatar") + pattern = USER_PATTERN + r"/avatar" test = ("https://www.pixiv.net/en/users/173530/avatar", { "content": "4e57544480cc2036ea9608103e8f024fa737fe66", }) @@ -262,8 +264,7 @@ class PixivBackgroundExtractor(PixivExtractor): subcategory = "background" filename_fmt = "background{date:?_//%Y-%m-%d}.{extension}" archive_fmt = "background_{user[id]}_{date}" - pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" - r"/(?:en/)?users/(\d+)/background") + pattern = USER_PATTERN + "/background" test = ("https://www.pixiv.net/en/users/194921/background", { "pattern": r"https://i\.pximg\.net/background/img/2021/01/30/16/12/02" r"/194921_af1f71e557a42f499213d4b9eaccc0f8\.jpg", @@ -377,12 +378,12 @@ def works(self): class PixivFavoriteExtractor(PixivExtractor): - """Extractor for all favorites/bookmarks of a pixiv-user""" + """Extractor for all favorites/bookmarks of a pixiv user""" subcategory = "favorite" directory_fmt = ("{category}", "bookmarks", "{user_bookmark[id]} {user_bookmark[account]}") archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:(?:en/)?" + pattern = (BASE_PATTERN + r"/(?:(?:en/)?" r"users/(\d+)/(bookmarks/artworks|following)(?:/([^/?#]+))?" 
r"|bookmark\.php)(?:\?([^#]*))?") test = ( @@ -485,8 +486,7 @@ class PixivRankingExtractor(PixivExtractor): archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}" directory_fmt = ("{category}", "rankings", "{ranking[mode]}", "{ranking[date]}") - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/ranking\.php(?:\?([^#]*))?") + pattern = BASE_PATTERN + r"/ranking\.php(?:\?([^#]*))?" test = ( ("https://www.pixiv.net/ranking.php?mode=daily&date=20170818"), ("https://www.pixiv.net/ranking.php"), @@ -551,8 +551,7 @@ class PixivSearchExtractor(PixivExtractor): subcategory = "search" archive_fmt = "s_{search[word]}_{id}{num}.{extension}" directory_fmt = ("{category}", "search", "{search[word]}") - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?" + pattern = (BASE_PATTERN + r"/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?" r"|search\.php)(?:\?([^#]+))?") test = ( ("https://www.pixiv.net/en/tags/Original", { @@ -635,8 +634,7 @@ class PixivFollowExtractor(PixivExtractor): subcategory = "follow" archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}" directory_fmt = ("{category}", "following") - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/bookmark_new_illust\.php") + pattern = BASE_PATTERN + r"/bookmark_new_illust\.php" test = ( ("https://www.pixiv.net/bookmark_new_illust.php"), ("https://touch.pixiv.net/bookmark_new_illust.php"), @@ -698,8 +696,7 @@ class PixivSeriesExtractor(PixivExtractor): directory_fmt = ("{category}", "{user[id]} {user[account]}", "{series[id]} {series[title]}") filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}" - pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" - r"/user/(\d+)/series/(\d+)") + pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)" test = ("https://www.pixiv.net/user/10509347/series/21859", { "range": "1-10", "count": 10, @@ -756,8 +753,7 @@ class PixivNovelExtractor(PixivExtractor): """Extractor for pixiv novels""" subcategory = "novel" request_interval = 1.0 - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/n(?:ovel/show\.php\?id=|/)(\d+)") + pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)" test = ( ("https://www.pixiv.net/novel/show.php?id=19612040", { "count": 1, @@ -903,8 +899,7 @@ def novels(self): class PixivNovelUserExtractor(PixivNovelExtractor): """Extractor for pixiv users' novels""" subcategory = "novel-user" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/(?:en/)?users/(\d+)/novels") + pattern = USER_PATTERN + r"/novels" test = ("https://www.pixiv.net/en/users/77055466/novels", { "pattern": "^text:", "range": "1-5", @@ -918,8 +913,7 @@ def novels(self): class PixivNovelSeriesExtractor(PixivNovelExtractor): """Extractor for pixiv novel series""" subcategory = "novel-series" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/novel/series/(\d+)") + pattern = BASE_PATTERN + r"/novel/series/(\d+)" test = ("https://www.pixiv.net/novel/series/10278364", { "count": 4, "content": "b06abed001b3f6ccfb1579699e9a238b46d38ea2", @@ -932,8 +926,7 @@ def novels(self): class PixivNovelBookmarkExtractor(PixivNovelExtractor): """Extractor for bookmarked pixiv novels""" subcategory = "novel-bookmark" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/(?:en/)?users/(\d+)/bookmarks/novels" + pattern = (USER_PATTERN + r"/bookmarks/novels" r"(?:/([^/?#]+))?(?:/?\?([^#]+))?") test = ( ("https://www.pixiv.net/en/users/77055466/bookmarks/novels", { From 0ad59c92b19d8280a8257b25c04f4b94d15d5963 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 28 May 2023 19:58:20 +0200 Subject: [PATCH 122/252] [blogger] download files from 'lh*.googleusercontent.com' (4070) --- gallery_dl/extractor/blogger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index eafc8aff9a..3ceada8d5a 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -44,6 +44,7 @@ def items(self): findall_image = re.compile( r'src="(https?://(?:' r'blogger\.googleusercontent\.com/img|' + r'lh\d+\.googleusercontent\.com/|' r'\d+\.bp\.blogspot\.com)/[^"]+)').findall findall_video = re.compile( r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall From 5a6fd8027d93b3026096d25f2989306873a52f2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 29 May 2023 23:03:45 +0200 Subject: [PATCH 123/252] [redgifs] support galleries (#4021) --- gallery_dl/extractor/redgifs.py | 47 +++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index eaaef7d800..267ac60a44 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2022 Mike Fährmann +# Copyright 2020-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -16,7 +16,8 @@ class RedgifsExtractor(Extractor): """Base class for redgifs extractors""" category = "redgifs" - filename_fmt = "{category}_{id}.{extension}" + filename_fmt = \ + "{category}_{gallery:?//[:11]}{num:?_/_/>02}{id}.{extension}" archive_fmt = "{id}" root = "https://www.redgifs.com" @@ -34,16 +35,32 @@ def __init__(self, match): def items(self): metadata = self.metadata() + for gif in self.gifs(): - url = self._process(gif) - if not url: - self.log.warning("Skipping '%s' (format not available)", - gif["id"]) - continue + + gallery = gif.get("gallery") + if gallery: + gifs = self.api.gallery(gallery)["gifs"] + enum = 1 + cnt = len(gifs) + else: + gifs = (gif,) + enum = 0 + cnt = 1 gif.update(metadata) + gif["count"] = cnt yield Message.Directory, gif - yield Message.Url, url, gif + + for num, gif in enumerate(gifs, enum): + url = self._process(gif) + if not url: + self.log.warning( + "Skipping '%s' (format not available)", gif["id"]) + continue + gif["num"] = num + gif["count"] = cnt + yield Message.Url, url, gif def _process(self, gif): gif["_fallback"] = formats = self._formats(gif) @@ -178,6 +195,16 @@ class RedgifsImageExtractor(RedgifsExtractor): r"/FoolishForkedAbyssiniancat\.mp4", "content": "f6e03f1df9a2ff2a74092f53ee7580d2fb943533", }), + # gallery (#4021) + ("https://www.redgifs.com/watch/desertedbaregraywolf", { + "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.jpg", + "count": 4, + "keyword": { + "num": int, + "count": 4, + "gallery": "187ad979693-1922-fc66-0000-a96fb07b8a5d", + }, + }), ("https://redgifs.com/ifr/FoolishForkedAbyssiniancat"), ("https://i.redgifs.com/i/FoolishForkedAbyssiniancat"), ("https://www.gifdeliverynetwork.com/foolishforkedabyssiniancat"), @@ -207,6 +234,10 @@ def gif(self, gif_id): endpoint = "/v2/gifs/" + gif_id.lower() return self._call(endpoint)["gif"] + def gallery(self, gallery_id): + endpoint = "/v2/gallery/" + gallery_id + return self._call(endpoint) + def user(self, user, order="best"): endpoint = 
"/v2/users/{}/search".format(user.lower()) params = {"order": order} From bab13402dff4e0968ee848d836da253cdf762e72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 30 May 2023 15:42:31 +0200 Subject: [PATCH 124/252] [redgifs] update 'search' URL pattern (#4115) --- gallery_dl/extractor/redgifs.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 267ac60a44..bfd18b5db4 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -162,21 +162,36 @@ class RedgifsSearchExtractor(RedgifsExtractor): """Extractor for redgifs search results""" subcategory = "search" directory_fmt = ("{category}", "Search", "{search}") - pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/browse/?\?([^#]+)" + pattern = (r"(?:https?://)?(?:\w+\.)?redgifs\.com" + r"/(?:gifs/([^/?#]+)|browse)(?:/?\?([^#]+))?") test = ( + ("https://www.redgifs.com/gifs/jav", { + "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)", + "range": "1-10", + "count": 10, + }), ("https://www.redgifs.com/browse?tags=JAV", { "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)", "range": "1-10", "count": 10, }), - ("https://v3.redgifs.com/browse?tags=JAV"), + ("https://www.redgifs.com/gifs/jav?order=best&verified=1"), ("https://www.redgifs.com/browse?type=i&verified=y&order=top7"), + ("https://v3.redgifs.com/browse?tags=JAV"), ) + def __init__(self, match): + RedgifsExtractor.__init__(self, match) + self.search, self.query = match.groups() + def metadata(self): - self.params = params = text.parse_query(self.key) - search = params.get("tags") or params.get("order") or "trending" - return {"search": search} + self.params = text.parse_query(self.query) + if self.search: + self.params["tags"] = text.unquote(self.search) + + return {"search": (self.params.get("tags") or + self.params.get("order") or + "trending")} def gifs(self): return self.api.search(self.params) @@ -259,7 +274,6 @@ def collections(self, user): def search(self, params): endpoint = "/v2/gifs/search" params["search_text"] = params.pop("tags", None) - params.pop("needSendGtm", None) return self._pagination(endpoint, params) def _call(self, endpoint, params=None): From 0cf7282fa0379cd4607a06183cd97e6d1fb86387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 1 Jun 2023 13:07:20 +0200 Subject: [PATCH 125/252] [pixiv] add 'full-series' option for novels (#4111) --- docs/configuration.rst | 11 +++++++++++ gallery_dl/extractor/pixiv.py | 11 ++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 024bf481e0..f8bd38b85f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2436,6 +2436,17 @@ Description Download images embedded in novels. +extractor.pixiv.novel.full-series +--------------------------------- +Type + ``bool`` +Default + ``false`` +Description + When downloading a novel being part of a series, + download all novels of that series. 
+ + extractor.pixiv.metadata ------------------------ Type diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 6781a331bf..1fc739c78b 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -796,6 +796,11 @@ class PixivNovelExtractor(PixivExtractor): "options": (("embeds", True),), "count": 3, }), + # full series + ("https://www.pixiv.net/novel/show.php?id=19612040", { + "options": (("full-series", True),), + "count": 4, + }), # short URL ("https://www.pixiv.net/n/19612040"), ) @@ -893,7 +898,11 @@ def transform_tags(work): yield Message.Queue, url, novel def novels(self): - return (self.api.novel_detail(self.novel_id),) + novel = self.api.novel_detail(self.novel_id) + if self.config("full-series") and novel["series"]: + self.subcategory = PixivNovelSeriesExtractor.subcategory + return self.api.novel_series(novel["series"]["id"]) + return (novel,) class PixivNovelUserExtractor(PixivNovelExtractor): From 94b6a67666abbd734ef1bed61ade9d09d60283e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 1 Jun 2023 14:51:58 +0200 Subject: [PATCH 126/252] [reddit] fix crash with empty 'crosspost_parent_lists' (#4120) --- gallery_dl/extractor/reddit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 3f09e13e41..9a57dcfe8b 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -56,7 +56,10 @@ def items(self): submission["num"] = 0 if "crosspost_parent_list" in submission: - media = submission["crosspost_parent_list"][-1] + try: + media = submission["crosspost_parent_list"][-1] + except Exception: + media = submission else: media = submission From 271f23d97179fa9bf0a35d9e358cbffb89099ada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 1 Jun 2023 15:31:52 +0200 Subject: [PATCH 127/252] [twitter] extract 'conversation_id' metadata (#3839) --- gallery_dl/extractor/twitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index c47021ef52..cc4667b0bf 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -295,6 +295,8 @@ def _transform_tweet(self, tweet): tget("quoted_by_id_str")), "reply_id" : text.parse_int( tget("in_reply_to_status_id_str")), + "conversation_id": text.parse_int( + tget("conversation_id_str")), "date" : date, "author" : author, "user" : self._user or author, From 45cc7cee1a2e1f9c993e705faeec7e37f3f1be09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 1 Jun 2023 16:03:45 +0200 Subject: [PATCH 128/252] [twitter] better error message for guest searches (#3942) --- gallery_dl/extractor/twitter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index cc4667b0bf..cfab4d031d 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1418,6 +1418,12 @@ def _call(self, endpoint, params, method="GET", auth=True): self.extractor.wait(until=until, seconds=seconds) continue + if response.status_code == 403 and \ + not self.headers["x-twitter-auth-type"] and \ + endpoint == "/2/search/adaptive.json": + raise exception.AuthorizationError( + "Login required to access search results") + # error try: data = response.json() From 864a654b2598791871100988bd21347a6b6c3e43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 1 Jun 2023 
18:53:30 +0200 Subject: [PATCH 129/252] [twitter] update query hashes --- gallery_dl/extractor/twitter.py | 72 ++++++++++++++++----------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index cfab4d031d..4141d65920 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -666,6 +666,7 @@ class TwitterSearchExtractor(TwitterExtractor): subcategory = "search" pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)" test = ("https://twitter.com/search?q=nature", { + "exception": exception.AuthorizationError, "range": "1-40", "count": 40, "archive": False, @@ -1060,7 +1061,7 @@ class TwitterAPI(): def __init__(self, extractor): self.extractor = extractor - self.root = "https://api.twitter.com" + self.root = "https://twitter.com/i/api" self._nsfw_warning = True self._syndication = self.extractor.syndication self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode @@ -1089,7 +1090,6 @@ def __init__(self, extractor): "x-twitter-client-language": "en", "x-twitter-active-user": "yes", "x-csrf-token": csrf_token, - "Origin": "https://twitter.com", "Referer": "https://twitter.com/", } self.params = { @@ -1133,47 +1133,44 @@ def __init__(self, extractor): "enrichments,superFollowMetadata,unmentionInfo,editControl," "collab_control,vibe", } - self.variables = { - "withDownvotePerspective": False, - "withReactionsMetadata": False, - "withReactionsPerspective": False, - } self.features = { - "blue_business_profile_image_shape_enabled": False, - "responsive_web_twitter_blue_verified_badge_is_enabled": True, + "hidden_profile_likes_enabled": False, "responsive_web_graphql_exclude_directive_enabled": True, "verified_phone_label_enabled": False, - "responsive_web_graphql_skip_user_profile_" - "image_extensions_enabled": False, + "subscriptions_verification_info_verified_since_enabled": True, + "highlights_tweets_tab_ui_enabled": True, + "creator_subscriptions_tweet_preview_api_enabled": True, + "responsive_web_graphql_" + "skip_user_profile_image_extensions_enabled": False, "responsive_web_graphql_timeline_navigation_enabled": True, } self.features_pagination = { - "blue_business_profile_image_shape_enabled": False, - "responsive_web_twitter_blue_verified_badge_is_enabled": True, + "rweb_lists_timeline_redesign_enabled": True, "responsive_web_graphql_exclude_directive_enabled": True, "verified_phone_label_enabled": False, + "creator_subscriptions_tweet_preview_api_enabled": True, "responsive_web_graphql_timeline_navigation_enabled": True, "responsive_web_graphql_skip_user_profile_" "image_extensions_enabled": False, "tweetypie_unmention_optimization_enabled": True, - "vibe_api_enabled": True, "responsive_web_edit_tweet_api_enabled": True, "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, "view_counts_everywhere_api_enabled": True, "longform_notetweets_consumption_enabled": True, "tweet_awards_web_tipping_enabled": False, - "freedom_of_speech_not_reach_fetch_enabled": False, + "freedom_of_speech_not_reach_fetch_enabled": True, "standardized_nudges_misinfo": True, "tweet_with_visibility_results_prefer_gql_" "limited_actions_policy_enabled": False, "interactive_text_enabled": True, "responsive_web_text_conversations_enabled": False, - "longform_notetweets_richtext_consumption_enabled": False, + "longform_notetweets_rich_text_read_enabled": True, + "longform_notetweets_inline_media_enabled": False, "responsive_web_enhance_cards_enabled": False, } def 
tweet_detail(self, tweet_id): - endpoint = "/graphql/AV_lPTkN6Fc6LgerQpK8Zg/TweetDetail" + endpoint = "/graphql/JlLZj42Ltr2qwjasw-l5lQ/TweetDetail" variables = { "focalTweetId": tweet_id, "referrer": "profile", @@ -1181,9 +1178,7 @@ def tweet_detail(self, tweet_id): "includePromotedContent": True, "withCommunity": True, "withQuickPromoteEligibilityTweetFields": True, - "withBirdwatchNotes": False, - "withSuperFollowsUserFields": True, - "withSuperFollowsTweetFields": True, + "withBirdwatchNotes": True, "withVoice": True, "withV2Timeline": True, } @@ -1191,7 +1186,7 @@ def tweet_detail(self, tweet_id): endpoint, variables, ("threaded_conversation_with_injections_v2",)) def user_tweets(self, screen_name): - endpoint = "/graphql/BeHK76TOCY3P8nO-FWocjA/UserTweets" + endpoint = "/graphql/-AY51QoFpVf-w7TxjQ6lpw/UserTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1203,7 +1198,7 @@ def user_tweets(self, screen_name): return self._pagination_tweets(endpoint, variables) def user_tweets_and_replies(self, screen_name): - endpoint = "/graphql/eZVlZu_1gwb6hMUDXBnZoQ/UserTweetsAndReplies" + endpoint = "/graphql/urrCZMyyIh1FkSFi2cdPUA/UserTweetsAndReplies" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1215,7 +1210,7 @@ def user_tweets_and_replies(self, screen_name): return self._pagination_tweets(endpoint, variables) def user_media(self, screen_name): - endpoint = "/graphql/d_ONZLUHGCsErBCriRsLXg/UserMedia" + endpoint = "/graphql/lo965xQZdN2-eSM1Jc-W_A/UserMedia" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1248,7 +1243,7 @@ def user_media_legacy(self, screen_name): features=False) def user_likes(self, screen_name): - endpoint = "/graphql/fN4-E0MjFJ9Cn7IYConL7g/Likes" + endpoint = "/graphql/6JET1d0iHsIzW0Zjs3OOwQ/Likes" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1261,7 +1256,7 @@ def user_likes(self, screen_name): return self._pagination_tweets(endpoint, variables) def user_bookmarks(self): - endpoint = "/graphql/RV1g3b8n_SGOHwkqKYSCFw/Bookmarks" + endpoint = "/graphql/YNtYqNuki6_oiVwx0uP8mQ/Bookmarks" variables = { "count": 100, } @@ -1272,7 +1267,7 @@ def user_bookmarks(self): features=features) def list_latest_tweets_timeline(self, list_id): - endpoint = "/graphql/5DAiJG3bD77SiWEs4xViBw/ListLatestTweetsTimeline" + endpoint = "/graphql/ZBbXrl37E6za5ml-DIpmgg/ListLatestTweetsTimeline" variables = { "listId": list_id, "count": 100, @@ -1307,11 +1302,10 @@ def live_event(self, event_id): ["twitter_objects"]["live_events"][event_id]) def list_by_rest_id(self, list_id): - endpoint = "/graphql/D0EoyrDcct2MEqC-LnPzFg/ListByRestId" + endpoint = "/graphql/AmCdeFUvlrKAO96yHr-GCg/ListByRestId" params = { "variables": self._json_dumps({ "listId": list_id, - "withSuperFollowsUserFields": True, }), "features": self._json_dumps(self.features), } @@ -1321,7 +1315,7 @@ def list_by_rest_id(self, list_id): raise exception.NotFoundError("list") def list_members(self, list_id): - endpoint = "/graphql/tzsIIbGUH9RyFCVmtO2W2w/ListMembers" + endpoint = "/graphql/a_ZQomd3MMk1crWkeiQBPg/ListMembers" variables = { "listId": list_id, "count": 100, @@ -1331,7 +1325,7 @@ def list_members(self, list_id): endpoint, variables, ("list", "members_timeline", "timeline")) def user_following(self, screen_name): - endpoint = "/graphql/FaBzCqZXuQCb4PhB0RHqHw/Following" + endpoint = "/graphql/JPZiqKjET7_M1r5Tlr8pyA/Following" variables = { "userId": 
self._user_id_by_screen_name(screen_name), "count": 100, @@ -1340,18 +1334,20 @@ def user_following(self, screen_name): return self._pagination_users(endpoint, variables) def user_by_rest_id(self, rest_id): - endpoint = "/graphql/S2BkcAyFMG--jef2N6Dgzw/UserByRestId" + endpoint = "/graphql/1YAM811Q8Ry4XyPpJclURQ/UserByRestId" + features = self.features.copy() + features["blue_business_profile_image_shape_enabled"] = True params = { "variables": self._json_dumps({ "userId": rest_id, "withSafetyModeUserFields": True, }), - "features": self._json_dumps(self.features), + "features": self._json_dumps(features), } return self._call(endpoint, params)["data"]["user"]["result"] def user_by_screen_name(self, screen_name): - endpoint = "/graphql/k26ASEiniqy4eXMdknTSoQ/UserByScreenName" + endpoint = "/graphql/XA6F1nJELYg65hxOC2Ekmg/UserByScreenName" params = { "variables": self._json_dumps({ "screen_name": screen_name, @@ -1382,7 +1378,9 @@ def _user_id_by_screen_name(self, screen_name): def _guest_token(self): endpoint = "/1.1/guest/activate.json" self.extractor.log.info("Requesting guest token") - return str(self._call(endpoint, None, "POST", False)["guest_token"]) + return str(self._call( + endpoint, None, "POST", False, "https://api.twitter.com", + )["guest_token"]) def _authenticate_guest(self): guest_token = self._guest_token() @@ -1391,8 +1389,8 @@ def _authenticate_guest(self): self.extractor.session.cookies.set( "gt", guest_token, domain=self.extractor.cookiedomain) - def _call(self, endpoint, params, method="GET", auth=True): - url = self.root + endpoint + def _call(self, endpoint, params, method="GET", auth=True, root=None): + url = (root or self.root) + endpoint while True: if not self.headers["x-twitter-auth-type"] and auth: @@ -1532,7 +1530,6 @@ def _pagination_legacy(self, endpoint, params): def _pagination_tweets(self, endpoint, variables, path=None, stop_tweets=True, features=None): extr = self.extractor - variables.update(self.variables) original_retweets = (extr.retweets == "original") pinned_tweet = extr.pinned @@ -1695,7 +1692,6 @@ def _pagination_tweets(self, endpoint, variables, variables["cursor"] = cursor def _pagination_users(self, endpoint, variables, path=None): - variables.update(self.variables) params = {"variables": None, "features" : self._json_dumps(self.features_pagination)} From 54cf1fa3e75a3836097f2752b164cc49eb353a6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 1 Jun 2023 21:23:31 +0200 Subject: [PATCH 130/252] [twitter] use GraphQL search endpoint (#3942) for guest users; selectable with 'search-endpoint' option. adapted from https://github.com/JustAnotherArchivist/snscrape/commit/9c7b888ffa5f80717595aeed26023a3f9f9020aa --- docs/configuration.rst | 14 +++++++++++ gallery_dl/extractor/twitter.py | 41 +++++++++++++++++++++++++++------ 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index f8bd38b85f..aa58af9c4c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3155,6 +3155,20 @@ Description will be taken from the original Tweets, not the Retweets. +extractor.twitter.search-endpoint +--------------------------------- +Type + ``string`` +Default + ``"auto"`` +Description + Selects the API endpoint used to retrieve search results. 
+ + * ``"rest"``: Legacy REST endpoint - returns a ``403 Forbidden`` error when not logged in + * ``"graphql"``: New GraphQL endpoint + * ``"auto"``: ``"rest"`` when logged in, ``"graphql"`` otherwise + + extractor.twitter.timeline.strategy ----------------------------------- Type diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 4141d65920..710bde336a 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -666,9 +666,8 @@ class TwitterSearchExtractor(TwitterExtractor): subcategory = "search" pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)" test = ("https://twitter.com/search?q=nature", { - "exception": exception.AuthorizationError, - "range": "1-40", - "count": 40, + "range": "1-20", + "count": 20, "archive": False, }) @@ -1080,6 +1079,10 @@ def __init__(self, extractor): auth_token = cookies.get("auth_token", domain=cookiedomain) + search = extractor.config("search-endpoint") + if search == "graphql" or not auth_token and search in ("auto", None): + self.search_adaptive = self.search_timeline + self.headers = { "Accept": "*/*", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" @@ -1285,6 +1288,24 @@ def search_adaptive(self, query): params["spelling_corrections"] = "1" return self._pagination_legacy(endpoint, params) + def search_timeline(self, query): + endpoint = "/graphql/7jT5GT59P8IFjgxwqnEdQw/SearchTimeline" + variables = { + "rawQuery": query, + "count": 20, + "product": "Latest", + "withDownvotePerspective": False, + "withReactionsMetadata": False, + "withReactionsPerspective": False, + } + features = self.features_pagination.copy() + features["blue_business_profile_image_shape_enabled"] = False + features["vibe_api_enabled"] = True + return self._pagination_tweets( + endpoint, variables, + ("search_by_raw_query", "search_timeline", "timeline"), + features=features) + def live_event_timeline(self, event_id): endpoint = "/2/live_event/timeline/{}.json".format(event_id) params = self.params.copy() @@ -1553,11 +1574,17 @@ def _pagination_tweets(self, endpoint, variables, instructions = instructions[key] instructions = instructions["instructions"] + cursor = None + entries = None for instr in instructions: - if instr.get("type") == "TimelineAddEntries": + instr_type = instr.get("type") + if instr_type == "TimelineAddEntries": entries = instr["entries"] - break - else: + elif instr_type == "TimelineReplaceEntry": + entry = instr["entry"] + if entry["entryId"].startswith("cursor-bottom-"): + cursor = entry["content"]["value"] + if entries is None: raise KeyError() except LookupError: @@ -1586,7 +1613,7 @@ def _pagination_tweets(self, endpoint, variables, "Unable to retrieve Tweets from this timeline") tweets = [] - tweet = cursor = None + tweet = None if pinned_tweet: pinned_tweet = False From 952c03bc9ef633b6ee4a67d5cf09e286f2813eea Mon Sep 17 00:00:00 2001 From: Bad Manners Date: Fri, 2 Jun 2023 19:53:47 -0300 Subject: [PATCH 131/252] Add fav_id data to FuraffinityFavoriteExtractor An extra field is collected when paginating favorites, and saved to a temporary cache variable. This field is identical for both the old and the new page layouts for FurAffinity, but can only be collected during pagination, hence the cache variable. Other FurAffinity extractors should be unaffected by this change. 
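
The mechanism, condensed from the diff below into a rough sketch:

    # while paginating /favorites/{user}/, remember each submission's fav id
    self._fa_extra_data_fav_dict[post_id] = {
        'fav_id': text.parse_int(extr('data-fav-id="', '"')),
    }

    # when the post itself is parsed, merge the cached entry back into its
    # metadata (_fa_extra_post_data() pops it from the cache again)
    extra_data = self._fa_extra_post_data(post_id)
    if extra_data:
        data.update(extra_data)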
--- gallery_dl/extractor/furaffinity.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index cc43cec919..51ae094181 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -63,6 +63,9 @@ def items(self): def metadata(self): return None + def _fa_extra_post_data(self, post_id): + return None + def skip(self, num): self.offset += num return num @@ -132,6 +135,10 @@ def _parse_post(self, post_id): data["date"] = text.parse_timestamp(data["filename"].partition(".")[0]) data["description"] = self._process_description(data["_description"]) + extra_data = self._fa_extra_post_data(post_id) + if extra_data: + data.update(extra_data) + return data @staticmethod @@ -159,7 +166,16 @@ def _pagination_favorites(self): while path: page = self.request(self.root + path).text - yield from text.extract_iter(page, 'id="sid-', '"') + extr = text.extract_from(page) + while True: + post_id = extr('id="sid-', '"') + if not post_id: + break + if hasattr(self, '_fa_extra_data_fav_dict'): + self._fa_extra_data_fav_dict[post_id] = { + 'fav_id': text.parse_int(extr('data-fav-id="', '"')), + } + yield post_id path = text.extr(page, 'right" href="', '"') def _pagination_search(self, query): @@ -238,6 +254,7 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor): subcategory = "favorite" directory_fmt = ("{category}", "{user!l}", "Favorites") pattern = BASE_PATTERN + r"/favorites/([^/?#]+)" + _fa_extra_data_fav_dict = {} test = ("https://www.furaffinity.net/favorites/mirlinthloth/", { "pattern": r"https://d\d?\.f(uraffinity|acdn)\.net" r"/art/[^/]+/\d+/\d+.\w+\.\w+", @@ -248,6 +265,9 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor): def posts(self): return self._pagination_favorites() + def _fa_extra_post_data(self, post_id): + return self._fa_extra_data_fav_dict.pop(post_id, None) + class FuraffinitySearchExtractor(FuraffinityExtractor): """Extractor for furaffinity search results""" From 92178b369cec269376f7f2e8734bb8c82e702bea Mon Sep 17 00:00:00 2001 From: chio0hai <94094996+chio0hai@users.noreply.github.com> Date: Sat, 3 Jun 2023 00:23:34 -0400 Subject: [PATCH 132/252] [postimage] add gallery support, update image extractor to download original image instead of main image --- gallery_dl/extractor/imagehosts.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index df4ff26556..4457175868 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -295,19 +295,38 @@ class PostimgImageExtractor(ImagehostImageExtractor): """Extractor for single images from postimages.org""" category = "postimg" pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)" - r"/(?:image/)?([^/?#]+)/?)") + r"/(?!gallery/)(?:image/)?([^/?#]+)/?)") test = ("https://postimg.cc/Wtn2b3hC", { - "url": "0794cfda9b8951a8ac3aa692472484200254ab86", + "url": "72f3c8b1d6c6601a20ad58f35635494b4891a99e", "keyword": "2d05808d04e4e83e33200db83521af06e3147a84", "content": "cfaa8def53ed1a575e0c665c9d6d8cf2aac7a0ee", }) def get_info(self, page): - url , pos = text.extract(page, 'id="main-image" src="', '"') + pos = page.index(' id="download"') + url , pos = text.rextract(page, ' href="', '"', pos) filename, pos = text.extract(page, 'class="imagename">', '<', pos) return url, text.unescape(filename) +class 
PostimgGalleryExtractor(ImagehostImageExtractor): + """Extractor for images galleries from postimages.org""" + category = "postimg" + subcategory = "gallery" + pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)" + r"/(?:gallery/)([^/?#]+)/?)") + test = ("https://postimg.cc/gallery/wxpDLgX", { + "pattern": PostimgImageExtractor.pattern, + "count": 22, + }) + + def items(self): + page = self.request(self.page_url).text + data = {"_extractor": PostimgImageExtractor} + for url in text.extract_iter(page, ' class="thumb"> Date: Sat, 3 Jun 2023 00:58:33 -0400 Subject: [PATCH 133/252] [acidimg] fix extractor --- gallery_dl/extractor/imagehosts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index df4ff26556..a731342f5a 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -164,17 +164,17 @@ class AcidimgImageExtractor(ImagehostImageExtractor): pattern = r"(?:https?://)?((?:www\.)?acidimg\.cc/img-([a-z0-9]+)\.html)" test = ("https://acidimg.cc/img-5acb6b9de4640.html", { "url": "f132a630006e8d84f52d59555191ed82b3b64c04", - "keyword": "a8bb9ab8b2f6844071945d31f8c6e04724051f37", + "keyword": "135347ab4345002fc013863c0d9419ba32d98f78", "content": "0c8768055e4e20e7c7259608b67799171b691140", }) params = "simple" encoding = "utf-8" def get_info(self, page): - url, pos = text.extract(page, "", " Date: Sat, 3 Jun 2023 14:56:47 +0200 Subject: [PATCH 134/252] add jpg.pet as alias for jpgfish --- gallery_dl/extractor/jpgfish.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py index cdcf35cb10..e254112b2a 100644 --- a/gallery_dl/extractor/jpgfish.py +++ b/gallery_dl/extractor/jpgfish.py @@ -9,7 +9,7 @@ from .common import Extractor, Message from .. 
import text

-BASE_PATTERN = r"(?:https?://)?jpg\.(?:fishing|church)"
+BASE_PATTERN = r"(?:https?://)?jpg\.(?:fishing|church|pet)"


 class JpgfishExtractor(Extractor):
From 0281cc7d08acd6f155ea5fa871a8f6942a2ab421 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= 
Date: Sat, 3 Jun 2023 15:40:21 +0200
Subject: [PATCH 135/252] [fanbox] skip 404ed fanbox embeds (#4088)

continuation of 4fc9675d

---
 gallery_dl/extractor/fanbox.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 4ca0852586..373529f447 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -214,9 +214,15 @@ def _process_embed(self, post, embed):
             # to a proper Fanbox URL
             url = "https://www.pixiv.net/fanbox/"+content_id
             # resolve redirect
-            response = self.request(url, method="HEAD", allow_redirects=False)
-            url = response.headers["Location"]
-            final_post["_extractor"] = FanboxPostExtractor
+            try:
+                url = self.request(url, method="HEAD",
+                                   allow_redirects=False).headers["location"]
+            except Exception as exc:
+                url = None
+                self.log.warning("Unable to extract fanbox embed %s (%s: %s)",
+                                 content_id, exc.__class__.__name__, exc)
+            else:
+                final_post["_extractor"] = FanboxPostExtractor
         elif provider == "twitter":
             url = "https://twitter.com/_/status/"+content_id
         elif provider == "google_forms":
From a90974178de2c9685ee42deed57e22e71756c7ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= 
Date: Sat, 3 Jun 2023 15:49:56 +0200
Subject: [PATCH 136/252] [jpgfish] update domain to 'jpg.pet' (#4138)

---
 docs/supportedsites.md          |  2 +-
 gallery_dl/extractor/jpgfish.py | 23 ++++++++++++++---------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index ece48ced1a..b3d4b2ac6f 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -435,7 +435,7 @@ Consider all sites to be NSFW unless otherwise known.
 <tr>
     <td>JPG Fish</td>
-    <td>https://jpg.fishing/</td>
+    <td>https://jpg.pet/</td>
     <td>Albums, individual Images</td>
     <td></td>
 </tr>
diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py
index e254112b2a..b8d425a865 100644
--- a/gallery_dl/extractor/jpgfish.py
+++ b/gallery_dl/extractor/jpgfish.py
@@ -4,18 +4,18 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://jpg.fishing/"""
+"""Extractors for https://jpg.pet/"""
 
 from .common import Extractor, Message
 from ..
import text -BASE_PATTERN = r"(?:https?://)?jpg\.(?:fishing|church|pet)" +BASE_PATTERN = r"(?:https?://)?jpg\.(?:pet|fish(?:ing)?|church)" class JpgfishExtractor(Extractor): """Base class for jpgfish extractors""" category = "jpgfish" - root = "https://jpg.fishing" + root = "https://jpg.pet" directory_fmt = ("{category}", "{user}", "{album}",) archive_fmt = "{id}" @@ -36,7 +36,7 @@ class JpgfishImageExtractor(JpgfishExtractor): subcategory = "image" pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))" test = ( - ("https://jpg.fishing/img/funnymeme.LecXGS", { + ("https://jpg.pet/img/funnymeme.LecXGS", { "pattern": r"https://simp3\.jpg\.church/images/funnymeme\.jpg", "content": "098e5e9b17ad634358426e0ffd1c93871474d13c", "keyword": { @@ -52,7 +52,9 @@ class JpgfishImageExtractor(JpgfishExtractor): "pattern": r"https://simp2\.jpg\.church/hannahowo_00457\.jpg", "keyword": {"album": "401-500"}, }), - ("https://jpg.church/img/hannahowo-00424.au64iA"), + ("https://jpg.fishing/img/funnymeme.LecXGS"), + ("https://jpg.fish/img/funnymeme.LecXGS"), + ("https://jpg.church/img/funnymeme.LecXGS"), ) def __init__(self, match): @@ -81,13 +83,13 @@ class JpgfishAlbumExtractor(JpgfishExtractor): subcategory = "album" pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?" test = ( - ("https://jpg.fishing/album/CDilP/?sort=date_desc&page=1", { + ("https://jpg.pet/album/CDilP/?sort=date_desc&page=1", { "count": 2, }), - ("https://jpg.church/a/gunggingnsk.N9OOI", { + ("https://jpg.fishing/a/gunggingnsk.N9OOI", { "count": 114, }), - ("https://jpg.church/a/101-200.aNJ6A/", { + ("https://jpg.fish/a/101-200.aNJ6A/", { "count": 100, }), ("https://jpg.church/a/hannahowo.aNTdH/sub", { @@ -118,12 +120,15 @@ class JpgfishUserExtractor(JpgfishExtractor): subcategory = "user" pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?" 
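    # the (?!img|a(?:lbum)?) lookahead presumably exists so that /img/... and
    # /a/... or /album/... URLs keep matching the image and album extractors
    # defined above instead of being treated as user profiles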
test = ( - ("https://jpg.fishing/exearco", { + ("https://jpg.pet/exearco", { "count": 3, }), ("https://jpg.church/exearco/albums", { "count": 1, }), + ("https://jpg.fishing/exearco"), + ("https://jpg.fish/exearco"), + ("https://jpg.church/exearco"), ) def __init__(self, match): From 4ae925c88fbaf03bab02573aed084703bc34736b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 6 Jun 2023 20:55:03 +0200 Subject: [PATCH 137/252] [kemonoparty] support '.su' TLD (#4139) --- gallery_dl/extractor/kemonoparty.py | 46 +++++++++++++++++++---------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 915fbe6879..5aeefeba98 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -14,7 +14,7 @@ import itertools import re -BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.party" +BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)" USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)" HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})" @@ -29,10 +29,11 @@ class KemonopartyExtractor(Extractor): cookiedomain = ".kemono.party" def __init__(self, match): - if match.group(1) == "coomer": - self.category = "coomerparty" - self.cookiedomain = ".coomer.party" + domain = match.group(1) + tld = match.group(2) + self.category = domain + "party" self.root = text.root_from_url(match.group(0)) + self.cookiedomain = ".{}.{}".format(domain, tld) Extractor.__init__(self, match) self.session.headers["Referer"] = self.root + "/" @@ -40,7 +41,7 @@ def items(self): self._prepare_ddosguard_cookies() self._find_inline = re.compile( - r'src="(?:https?://(?:kemono|coomer)\.party)?(/inline/[^"]+' + r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall find_hash = re.compile(HASH_PATTERN).match generators = self._build_file_generators(self.config("files")) @@ -224,11 +225,12 @@ class KemonopartyUserExtractor(KemonopartyExtractor): "options": (("max-posts", 25),), "count": "< 100", }), + ("https://kemono.su/subscribestar/user/alcorart"), ("https://kemono.party/subscribestar/user/alcorart"), ) def __init__(self, match): - _, service, user_id, offset = match.groups() + _, _, service, user_id, offset = match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id) @@ -329,13 +331,14 @@ class KemonopartyPostExtractor(KemonopartyExtractor): r"f51c10adc9dabd86e92bd52339f298b9\.txt", "content": "da39a3ee5e6b4b0d3255bfef95601890afd80709", # empty }), + ("https://kemono.su/subscribestar/user/alcorart/post/184330"), ("https://kemono.party/subscribestar/user/alcorart/post/184330"), ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"), ("https://beta.kemono.party/subscribestar/user/alcorart/post/184330"), ) def __init__(self, match): - _, service, user_id, post_id = match.groups() + _, _, service, user_id, post_id = match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) self.api_url = "{}/api/{}/user/{}/post/{}".format( @@ -361,9 +364,9 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): "count": 4, "keyword": {"channel_name": "finish-work"}, }), - (("https://kemono.party/discord" + (("https://kemono.su/discord" "/server/256559665620451329/channel/462437519519383555#"), { - "pattern": r"https://kemono\.party/data/(" + "pattern": 
r"https://kemono\.su/data/(" r"e3/77/e377e3525164559484ace2e64425b0cec1db08.*\.png|" r"51/45/51453640a5e0a4d23fbf57fb85390f9c5ec154.*\.gif)", "keyword": {"hash": "re:e377e3525164559484ace2e64425b0cec1db08" @@ -382,7 +385,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): def __init__(self, match): KemonopartyExtractor.__init__(self, match) - _, self.server, self.channel, self.channel_name = match.groups() + _, _, self.server, self.channel, self.channel_name = match.groups() def items(self): self._prepare_ddosguard_cookies() @@ -457,14 +460,20 @@ def posts(self): class KemonopartyDiscordServerExtractor(KemonopartyExtractor): subcategory = "discord-server" pattern = BASE_PATTERN + r"/discord/server/(\d+)$" - test = ("https://kemono.party/discord/server/488668827274444803", { - "pattern": KemonopartyDiscordExtractor.pattern, - "count": 13, - }) + test = ( + ("https://kemono.party/discord/server/488668827274444803", { + "pattern": KemonopartyDiscordExtractor.pattern, + "count": 13, + }), + ("https://kemono.su/discord/server/488668827274444803", { + "pattern": KemonopartyDiscordExtractor.pattern, + "count": 13, + }), + ) def __init__(self, match): KemonopartyExtractor.__init__(self, match) - self.server = match.group(2) + self.server = match.group(3) def items(self): url = "{}/api/discord/channels/lookup?q={}".format( @@ -493,11 +502,16 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): "url": "ecfccf5f0d50b8d14caa7bbdcf071de5c1e5b90f", "count": 3, }), + ("https://kemono.su/favorites?type=post", { + "pattern": KemonopartyPostExtractor.pattern, + "url": "4be8e84cb384a907a8e7997baaf6287b451783b5", + "count": 3, + }), ) def __init__(self, match): KemonopartyExtractor.__init__(self, match) - self.favorites = (text.parse_query(match.group(2)).get("type") or + self.favorites = (text.parse_query(match.group(3)).get("type") or self.config("favorites") or "artist") From 72e697b8b58966a97f95b5d0ef1a02225b897807 Mon Sep 17 00:00:00 2001 From: Stephan Date: Thu, 8 Jun 2023 14:01:28 +0200 Subject: [PATCH 138/252] Update bunkr.py Support bunkrr.su --- gallery_dl/extractor/bunkr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 7c66fb0efb..fe53ce37fd 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -16,7 +16,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): """Extractor for bunkr.la albums""" category = "bunkr" root = "https://bunkr.la" - pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:la|[sr]u|is|to)/a/([^/?#]+)" + pattern = r"(?:https?://)?(?:app\.)?bunkrr?\.(?:la|[sr]u|is|to)/a/([^/?#]+)" test = ( ("https://bunkr.la/a/Lktg9Keq", { "pattern": r"https://cdn\.bunkr\.ru/test-テスト-\"&>-QjgneIQv\.png", From a7c066cbacb6ff249e6087431206a1fd62b913f6 Mon Sep 17 00:00:00 2001 From: Stephan Date: Thu, 8 Jun 2023 14:10:25 +0200 Subject: [PATCH 139/252] Update bunkr.py --- gallery_dl/extractor/bunkr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index fe53ce37fd..36091903af 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -16,7 +16,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): """Extractor for bunkr.la albums""" category = "bunkr" root = "https://bunkr.la" - pattern = r"(?:https?://)?(?:app\.)?bunkrr?\.(?:la|[sr]u|is|to)/a/([^/?#]+)" + pattern = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)/a/([^/?#]+)" test = ( ("https://bunkr.la/a/Lktg9Keq", { 
"pattern": r"https://cdn\.bunkr\.ru/test-テスト-\"&>-QjgneIQv\.png", From b9692341fe3e591c881b650e91453c50292dab5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 8 Jun 2023 16:50:09 +0200 Subject: [PATCH 140/252] [jschan] update --- gallery_dl/extractor/jschan.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/jschan.py b/gallery_dl/extractor/jschan.py index cc2c7deeb0..fe758faa33 100644 --- a/gallery_dl/extractor/jschan.py +++ b/gallery_dl/extractor/jschan.py @@ -27,7 +27,7 @@ class JschanThreadExtractor(JschanExtractor): """Extractor for jschan threads""" subcategory = "thread" directory_fmt = ("{category}", "{board}", - "{threadId} {subject[:50]|message[:50]}") + "{threadId} {subject|nomarkup[:50]}") filename_fmt = "{postId}{num:?-//} {filename}.{extension}" archive_fmt = "{board}_{postId}_{num}" pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)\.html" @@ -56,11 +56,11 @@ def items(self): files = post.pop("files", ()) if files: thread.update(post) + thread["count"] = len(files) for num, file in enumerate(files): - file.update(thread) url = self.root + "/file/" + file["filename"] + file.update(thread) file["num"] = num - file["count"] = len(files) file["siteFilename"] = file["filename"] text.nameext_from_url(file["originalFilename"], file) yield Message.Url, url, file @@ -69,10 +69,8 @@ def items(self): class JschanBoardExtractor(JschanExtractor): """Extractor for jschan boards""" subcategory = "board" - pattern = ( - BASE_PATTERN + r"/([^/?#]+)(?:/index\.html|" - r"/catalog\.html|/\d+\.html|/?$)" - ) + pattern = (BASE_PATTERN + r"/([^/?#]+)" + r"(?:/index\.html|/catalog\.html|/\d+\.html|/?$)") test = ( ("https://94chan.org/art/", { "pattern": JschanThreadExtractor.pattern, From e0522ffb3d5082bfc1f7962730b8a225208da599 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 8 Jun 2023 17:01:04 +0200 Subject: [PATCH 141/252] [bunkr] update --- docs/supportedsites.md | 2 +- gallery_dl/extractor/bunkr.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b3d4b2ac6f..760cce948e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -111,7 +111,7 @@ Consider all sites to be NSFW unless otherwise known. Bunkr - https://bunkr.la/ + https://bunkrr.su/ Albums diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 36091903af..166a4f56f8 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -6,19 +6,19 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://bunkr.la/""" +"""Extractors for https://bunkrr.su/""" from .lolisafe import LolisafeAlbumExtractor from .. 
import text class BunkrAlbumExtractor(LolisafeAlbumExtractor): - """Extractor for bunkr.la albums""" + """Extractor for bunkrr.su albums""" category = "bunkr" - root = "https://bunkr.la" + root = "https://bunkrr.su" pattern = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)/a/([^/?#]+)" test = ( - ("https://bunkr.la/a/Lktg9Keq", { + ("https://bunkrr.su/a/Lktg9Keq", { "pattern": r"https://cdn\.bunkr\.ru/test-テスト-\"&>-QjgneIQv\.png", "content": "0c8768055e4e20e7c7259608b67799171b691140", "keyword": { @@ -52,6 +52,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "num": int, }, }), + ("https://bunkrr.su/a/Lktg9Keq"), ("https://bunkr.la/a/Lktg9Keq"), ("https://bunkr.su/a/Lktg9Keq"), ("https://bunkr.ru/a/Lktg9Keq"), From df106fb58b95c2c9ea760bd1fd67f28a3b8ae266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 8 Jun 2023 17:21:37 +0200 Subject: [PATCH 142/252] [bunkr] fix video downloads --- gallery_dl/extractor/bunkr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 166a4f56f8..5c8c530fc8 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -71,7 +71,7 @@ def fetch_album(self, album_id): cdn = None files = [] append = files.append - headers = {"Referer": self.root.replace("://", "://stream.", 1) + "/"} + headers = {"Referer": self.root + "/"} pos = page.index('class="grid-images') for url in text.extract_iter(page, ' Date: Thu, 8 Jun 2023 22:18:43 +0200 Subject: [PATCH 143/252] [senmanga] fix and update (#4160) --- gallery_dl/extractor/senmanga.py | 96 ++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 36 deletions(-) diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py index 34177b4120..6d025f45c2 100644 --- a/gallery_dl/extractor/senmanga.py +++ b/gallery_dl/extractor/senmanga.py @@ -1,64 +1,88 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2019 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters from from https://raw.senmanga.com/""" +"""Extractors for https://raw.senmanga.com/""" -from .common import Extractor, Message +from .common import ChapterExtractor from .. 
import text -class SenmangaChapterExtractor(Extractor): - """Extractor for manga-chapters from raw.senmanga.com""" +class SenmangaChapterExtractor(ChapterExtractor): + """Extractor for manga chapters from raw.senmanga.com""" category = "senmanga" - subcategory = "chapter" - directory_fmt = ("{category}", "{manga}", "{chapter_string}") - filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}" - archive_fmt = "{manga}_{chapter_string}_{page}" - pattern = r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)" + root = "https://raw.senmanga.com" + pattern = r"(?:https?://)?raw\.senmanga\.com(/[^/?#]+/[^/?#]+)" test = ( - ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", { + ("https://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", { + "pattern": r"https://raw\.senmanga\.com/viewer" + r"/Bokura-wa-Minna-Kawaisou/37A/[12]", "url": "5f95140ff511d8497e2ec08fa7267c6bb231faec", - "keyword": "705d941a150765edb33cd2707074bd703a93788c", "content": "556a16d5ca3441d7a5807b6b5ac06ec458a3e4ba", + "keyword": { + "chapter": "37A", + "count": 2, + "extension": "", + "filename": "re:[12]", + "lang": "ja", + "language": "Japanese", + "manga": "Bokura wa Minna Kawaisou", + "page": int, + }, }), ("http://raw.senmanga.com/Love-Lab/2016-03/1", { + "pattern": r"https://raw\.senmanga\.com/viewer" + r"/Love-Lab/2016-03/\d", "url": "8347b9f00c14b864dd3c19a1f5ae52adb2ef00de", - "keyword": "8a8ab2529ba2edfc83a6b3a8bede1d6c580db7b4", + "keyword": { + "chapter": "2016-03", + "count": 9, + "extension": "", + "filename": r"re:\d", + "manga": "Renai Lab 恋愛ラボ", + }, + }), + ("https://raw.senmanga.com/akabane-honeko-no-bodyguard/1", { + "pattern": r"https://i\d\.wp\.com/kumacdn.club/image-new-2/a" + r"/akabane-honeko-no-bodyguard/chapter-1" + r"/\d+-[0-9a-f]{13}\.jpg", + "keyword": { + "chapter": "1", + "count": 65, + "extension": "jpg", + "filename": r"re:\d+-\w+", + "manga": "Akabane Honeko no Bodyguard", + }, }), ) - root = "https://raw.senmanga.com" def __init__(self, match): - Extractor.__init__(self, match) - part = match.group(1) - self.chapter_url = "{}/{}/".format(self.root, part) - self.img_url = "{}/viewer/{}/".format(self.root, part) - self.session.headers["Referer"] = self.chapter_url + ChapterExtractor.__init__(self, match) + self.session.headers["Referer"] = self.gallery_url - def items(self): - data = self.metadata() - yield Message.Directory, data - for data["page"] in range(1, data["count"]+1): - data["extension"] = None - yield Message.Url, self.img_url + str(data["page"]), data + # select "All pages" viewer + self.session.cookies.set( + "viewer", "1", domain="raw.senmanga.com") - def metadata(self): - """Collect metadata for extractor-job""" - page = self.request(self.chapter_url).text - self.session.cookies.clear() - title, pos = text.extract(page, '', '') - count, pos = text.extract(page, ' of ', '\n', pos) + def metadata(self, page): + title = text.extr(page, "", "") manga, _, chapter = title.partition(" - Chapter ") return { - "manga": text.unescape(manga).replace("-", " "), - "chapter_string": chapter.partition(" - Page ")[0], - "count": text.parse_int(count), - "lang": "jp", - "language": "Japanese", + "manga" : text.unescape(manga).replace("-", " "), + "chapter" : chapter.partition(" - Page ")[0], + "chapter_minor": "", + "lang" : "ja", + "language" : "Japanese", } + + def images(self, page): + return [ + (url, None) + for url in text.extract_iter( + page, ' Date: Thu, 8 Jun 2023 22:22:43 +0200 Subject: [PATCH 144/252] [weibo] prevent fatal exception due to missing video (#4150) --- 
gallery_dl/extractor/weibo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 2cbfad6e7b..f41b1c05ff 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -123,7 +123,7 @@ def _extract_video(self, info): key=lambda m: m["meta"]["quality_index"]) except Exception: return {"url": (info.get("stream_url_hd") or - info["stream_url"])} + info.get("stream_url") or "")} else: return media["play_info"].copy() From ad882291d3f5a86c3ade08a35bf784c65e8d20b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 9 Jun 2023 16:04:39 +0200 Subject: [PATCH 145/252] [instagram] fix retrieving '/tagged' posts (#4122) reduce number of retrieved posts per API request from 50 to 20 --- gallery_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 677cbdda24..faeffa6a93 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -864,7 +864,7 @@ def user_saved(self): def user_tagged(self, user_id): endpoint = "/v1/usertags/{}/feed/".format(user_id) - params = {"count": 50} + params = {"count": 20} return self._pagination(endpoint, params) def _call(self, endpoint, **kwargs): From 5e3a1749c8833f9db8a7ef180dd4576f4ccc4d5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 9 Jun 2023 16:30:49 +0200 Subject: [PATCH 146/252] [furaffinity] simplify 'favorite_id' assignment --- gallery_dl/extractor/furaffinity.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 51ae094181..9f5cbbaec7 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -63,9 +63,6 @@ def items(self): def metadata(self): return None - def _fa_extra_post_data(self, post_id): - return None - def skip(self, num): self.offset += num return num @@ -135,10 +132,6 @@ def _parse_post(self, post_id): data["date"] = text.parse_timestamp(data["filename"].partition(".")[0]) data["description"] = self._process_description(data["_description"]) - extra_data = self._fa_extra_post_data(post_id) - if extra_data: - data.update(extra_data) - return data @staticmethod @@ -171,10 +164,7 @@ def _pagination_favorites(self): post_id = extr('id="sid-', '"') if not post_id: break - if hasattr(self, '_fa_extra_data_fav_dict'): - self._fa_extra_data_fav_dict[post_id] = { - 'fav_id': text.parse_int(extr('data-fav-id="', '"')), - } + self._favorite_id = text.parse_int(extr('data-fav-id="', '"')) yield post_id path = text.extr(page, 'right" href="', '"') @@ -254,10 +244,10 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor): subcategory = "favorite" directory_fmt = ("{category}", "{user!l}", "Favorites") pattern = BASE_PATTERN + r"/favorites/([^/?#]+)" - _fa_extra_data_fav_dict = {} test = ("https://www.furaffinity.net/favorites/mirlinthloth/", { "pattern": r"https://d\d?\.f(uraffinity|acdn)\.net" r"/art/[^/]+/\d+/\d+.\w+\.\w+", + "keyword": {"favorite_id": int}, "range": "45-50", "count": 6, }) @@ -265,8 +255,11 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor): def posts(self): return self._pagination_favorites() - def _fa_extra_post_data(self, post_id): - return self._fa_extra_data_fav_dict.pop(post_id, None) + def _parse_post(self, post_id): + post = FuraffinityExtractor._parse_post(self, post_id) + if 
post: + post["favorite_id"] = self._favorite_id + return post class FuraffinitySearchExtractor(FuraffinityExtractor): From 9f1aee3884e504fc279d244293d02d2ad739d6a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 10 Jun 2023 17:57:04 +0200 Subject: [PATCH 147/252] [vipergirls] limit number of requests per second (#4166) --- gallery_dl/extractor/vipergirls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 1cebdf752a..549163f759 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -18,6 +18,8 @@ class VipergirlsExtractor(Extractor): """Base class for vipergirls extractors""" category = "vipergirls" root = "https://vipergirls.to" + request_interval = 0.5 + request_interval_min = 0.2 def __init__(self, match): Extractor.__init__(self, match) From 0b34a444e0766a5001991537361555d387dc2ca3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 13 Jun 2023 18:58:35 +0200 Subject: [PATCH 148/252] [pixiv:novel] only detect Pixiv embeds (#4175) --- gallery_dl/extractor/pixiv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 1fc739c78b..861959e456 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -865,7 +865,7 @@ def transform_tags(work): illusts = {} for marker in text.extract_iter(content, "[", "]"): - if marker.startswith("[jumpuri:"): + if marker.startswith("[jumpuri:If you would like to "): desktop = True elif marker.startswith("pixivimage:"): illusts[marker[11:].partition("-")[0]] = None From db20a645c5e6aa00d2b5290bfbc1620a74c0e882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 13 Jun 2023 20:54:02 +0200 Subject: [PATCH 149/252] [vipergirls] use API endpoints (#4166) --- gallery_dl/extractor/vipergirls.py | 66 +++++++++++++----------------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 549163f759..5a20c67f44 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -9,7 +9,9 @@ """Extractors for https://vipergirls.to/""" from .common import Extractor, Message -from .. import text, exception +from .. 
import text, util + +from xml.etree import ElementTree BASE_PATTERN = r"(?:https?://)?(?:www\.)?vipergirls\.to" @@ -20,26 +22,21 @@ class VipergirlsExtractor(Extractor): root = "https://vipergirls.to" request_interval = 0.5 request_interval_min = 0.2 + cookiedomain = ".vipergirls.to" + cookienames = ("vg_userid", "vg_password") def __init__(self, match): Extractor.__init__(self, match) self.session.headers["Referer"] = self.root def items(self): - for html in self.posts(): - - pos = html.find('")[2].strip()), - } + for post in self.posts(): + data = post.attrib + data["thread_id"] = self.thread_id yield Message.Directory, data - for href in text.extract_iter(html, '', '') + url = "{}/vr.php?t={}".format(self.root, self.thread_id) + root = ElementTree.fromstring(self.request(url).text) + posts = root.iter("post") - url = text.extr(page, 'str diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index 2ff48c321e..500eaa1915 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -437,6 +437,7 @@ def __getitem__(key): "T": util.datetime_to_timestamp_string, "d": text.parse_timestamp, "U": text.unescape, + "H": lambda s: text.unescape(text.remove_html(s)), "g": text.slugify, "S": util.to_string, "s": str, diff --git a/test/test_formatter.py b/test/test_formatter.py index 1bda9d9c67..0992f4ba40 100644 --- a/test/test_formatter.py +++ b/test/test_formatter.py @@ -28,6 +28,7 @@ class TestFormatter(unittest.TestCase): "l": ["a", "b", "c"], "n": None, "s": " \n\r\tSPACE ", + "h": "
<p>foo </p> &amp; bar <p> </p>
    ", "u": "'< / >'", "t": 1262304000, "dt": datetime.datetime(2010, 1, 1), @@ -47,6 +48,10 @@ def test_conversions(self): self._run_test("{s!t}", "SPACE") self._run_test("{a!U}", self.kwdict["a"]) self._run_test("{u!U}", "'< / >'") + self._run_test("{a!H}", self.kwdict["a"]) + self._run_test("{h!H}", "foo & bar") + self._run_test("{u!H}", "'< / >'") + self._run_test("{n!H}", "") self._run_test("{a!s}", self.kwdict["a"]) self._run_test("{a!r}", "'" + self.kwdict["a"] + "'") self._run_test("{a!a}", "'" + self.kwdict["a"] + "'") @@ -434,10 +439,10 @@ def noarg(): fmt4 = formatter.parse("\fM " + path + ":lengths") self.assertEqual(fmt1.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt2.format_map(self.kwdict), "96") + self.assertEqual(fmt2.format_map(self.kwdict), "126") self.assertEqual(fmt3.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt4.format_map(self.kwdict), "96") + self.assertEqual(fmt4.format_map(self.kwdict), "126") with self.assertRaises(TypeError): self.assertEqual(fmt0.format_map(self.kwdict), "") From 654267a3354c0ce466eb54a4e2b6314ad6a29725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 15 Jun 2023 13:49:17 +0200 Subject: [PATCH 152/252] [weibo] fix 'json' extension for some videos --- gallery_dl/extractor/weibo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index f41b1c05ff..805aa536c4 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -72,6 +72,8 @@ def items(self): file["url"] = "https:" + file["url"][5:] if "filename" not in file: text.nameext_from_url(file["url"], file) + if file["extension"] == "json": + file["extension"] = "mp4" file["status"] = status file["num"] = num yield Message.Url, file["url"], file From 6c8bf9a762ce06bc582826336237716fd8d2fca1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 15 Jun 2023 16:24:18 +0200 Subject: [PATCH 153/252] [pornhub] improve redirect handling (#4188) --- gallery_dl/extractor/pornhub.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index f8497c0965..f19e33c37f 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,7 +11,6 @@ from .common import Extractor, Message from .. 
import text, exception - BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com" @@ -146,10 +145,20 @@ def items(self): data = {"_extractor": PornhubGalleryExtractor} while True: - page = self.request( - url, method="POST", headers=headers, params=params).text - if not page: - return - for gid in text.extract_iter(page, 'id="albumphoto', '"'): + response = self.request( + url, method="POST", headers=headers, params=params, + allow_redirects=False) + + if 300 <= response.status_code < 400: + url = "{}{}/photos/{}/ajax".format( + self.root, response.headers["location"], + self.cat or "public") + continue + + gid = None + for gid in text.extract_iter(response.text, 'id="albumphoto', '"'): yield Message.Queue, self.root + "/album/" + gid, data + if gid is None: + return + params["page"] += 1 From dc7af00014998c16aa7ace1ff2e9e8b3fe317621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 15 Jun 2023 19:33:59 +0200 Subject: [PATCH 154/252] [fantia] refactor - embed response data as hidden '_data' field (instead of returning/passing 'resp') - split _get_urls_from_post() --- gallery_dl/extractor/fantia.py | 125 ++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 55 deletions(-) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 13dfeada47..b0bf56ba15 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -31,15 +31,22 @@ def items(self): FantiaExtractor._warning = False for post_id in self.posts(): - full_response, post = self._get_post_data(post_id) - yield Message.Directory, post + post = self._get_post_data(post_id) post["num"] = 0 - for url, url_data in self._get_urls_from_post(full_response, post): - post["num"] += 1 - fname = url_data["content_filename"] or url - text.nameext_from_url(fname, url_data) - url_data["file_url"] = url - yield Message.Url, url, url_data + + for content in self._get_post_contents(post): + post["content_category"] = content["category"] + post["content_title"] = content["title"] + post["content_filename"] = content.get("filename", "") + post["content_id"] = content["id"] + yield Message.Directory, post + + for url in self._get_content_urls(post, content): + text.nameext_from_url( + post["content_filename"] or url, post) + post["file_url"] = url + post["num"] += 1 + yield Message.Url, url, post def posts(self): """Return post IDs""" @@ -71,7 +78,7 @@ def _get_post_data(self, post_id): """Fetch and process post data""" url = self.root+"/api/v1/posts/"+post_id resp = self.request(url, headers=self.headers).json()["post"] - post = { + return { "post_id": resp["id"], "post_url": self.root + "/posts/" + str(resp["id"]), "post_title": resp["title"], @@ -85,55 +92,63 @@ def _get_post_data(self, post_id): "fanclub_user_name": resp["fanclub"]["user"]["name"], "fanclub_name": resp["fanclub"]["name"], "fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]), - "tags": resp["tags"] + "tags": resp["tags"], + "_data": resp, } - return resp, post - def _get_urls_from_post(self, resp, post): + def _get_post_contents(self, post): + contents = post["_data"]["post_contents"] + + try: + url = post["_data"]["thumb"]["original"] + except Exception: + pass + else: + contents.insert(0, { + "id": "thumb", + "title": "thumb", + "category": "thumb", + "download_uri": url, + }) + + return contents + + def _get_content_urls(self, post, content): """Extract individual URL data from the response""" - if "thumb" in resp and resp["thumb"] and "original" in resp["thumb"]: - 
post["content_filename"] = "" - post["content_category"] = "thumb" - post["file_id"] = "thumb" - yield resp["thumb"]["original"], post - - for content in resp["post_contents"]: - post["content_category"] = content["category"] - post["content_title"] = content["title"] - post["content_filename"] = content.get("filename", "") - post["content_id"] = content["id"] - - if "comment" in content: - post["content_comment"] = content["comment"] - - if "post_content_photos" in content: - for photo in content["post_content_photos"]: - post["file_id"] = photo["id"] - yield photo["url"]["original"], post - - if "download_uri" in content: - post["file_id"] = content["id"] - yield self.root+"/"+content["download_uri"], post - - if content["category"] == "blog" and "comment" in content: - comment_json = util.json_loads(content["comment"]) - ops = comment_json.get("ops", ()) - - # collect blogpost text first - blog_text = "" - for op in ops: - insert = op.get("insert") - if isinstance(insert, str): - blog_text += insert - post["blogpost_text"] = blog_text - - # collect images - for op in ops: - insert = op.get("insert") - if isinstance(insert, dict) and "fantiaImage" in insert: - img = insert["fantiaImage"] - post["file_id"] = img["id"] - yield "https://fantia.jp" + img["original_url"], post + if "comment" in content: + post["content_comment"] = content["comment"] + + if "post_content_photos" in content: + for photo in content["post_content_photos"]: + post["file_id"] = photo["id"] + yield photo["url"]["original"] + + if "download_uri" in content: + post["file_id"] = content["id"] + url = content["download_uri"] + if url[0] == "/": + url = self.root + url + yield url + + if content["category"] == "blog" and "comment" in content: + comment_json = util.json_loads(content["comment"]) + ops = comment_json.get("ops") or () + + # collect blogpost text first + blog_text = "" + for op in ops: + insert = op.get("insert") + if isinstance(insert, str): + blog_text += insert + post["blogpost_text"] = blog_text + + # collect images + for op in ops: + insert = op.get("insert") + if isinstance(insert, dict) and "fantiaImage" in insert: + img = insert["fantiaImage"] + post["file_id"] = img["id"] + yield self.root + img["original_url"] class FantiaCreatorExtractor(FantiaExtractor): From f8452984fa0e1922197a1673c14554e426964828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 16 Jun 2023 15:40:59 +0200 Subject: [PATCH 155/252] [fantia] emit warning for non-visible contents (#4128) --- gallery_dl/extractor/fantia.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index b0bf56ba15..c4a9b4b841 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -41,6 +41,12 @@ def items(self): post["content_id"] = content["id"] yield Message.Directory, post + if content["visible_status"] != "visible": + self.log.warning( + "Unable to download '%s' files from " + "%s#post-content-id-%s", content["visible_status"], + post["post_url"], content["id"]) + for url in self._get_content_urls(post, content): text.nameext_from_url( post["content_filename"] or url, post) @@ -109,6 +115,7 @@ def _get_post_contents(self, post): "title": "thumb", "category": "thumb", "download_uri": url, + "visible_status": "visible", }) return contents From ef9891ec9d700c198053ffb2c0d475d459055a9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 16 Jun 2023 15:43:07 +0200 Subject: [PATCH 156/252] [fantia] extract 'plan' 
metadata (#2477, #4128) --- gallery_dl/extractor/fantia.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index c4a9b4b841..35c4cc4517 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -24,6 +24,14 @@ def items(self): "Accept" : "application/json, text/plain, */*", "Referer": self.root, } + _empty_plan = { + "id" : 0, + "price": 0, + "limit": 0, + "name" : "", + "description": "", + "thumb": self.root + "/images/fallback/plan/thumb_default.png", + } if self._warning: if not self._check_cookies(("_session_id",)): @@ -39,6 +47,7 @@ def items(self): post["content_title"] = content["title"] post["content_filename"] = content.get("filename", "") post["content_id"] = content["id"] + post["plan"] = content["plan"] or _empty_plan yield Message.Directory, post if content["visible_status"] != "visible": @@ -116,6 +125,7 @@ def _get_post_contents(self, post): "category": "thumb", "download_uri": url, "visible_status": "visible", + "plan": None, }) return contents From 339fcdb8ad3a1b1ba85f2a77347fc521ec552b45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 17 Jun 2023 13:47:00 +0200 Subject: [PATCH 157/252] [wallhaven] handle '429 Too Many Requests' errors (#4192) - set 1.4s delay between API requests (WH allows 45 requests per minute) - wait and retry on 429 errors --- gallery_dl/extractor/wallhaven.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 06f1aab8a7..a0fba3cca9 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,15 +9,16 @@ """Extractors for https://wallhaven.cc/""" from .common import Extractor, Message -from .. import text +from .. 
import text, exception class WallhavenExtractor(Extractor): """Base class for wallhaven extractors""" category = "wallhaven" + root = "https://wallhaven.cc" filename_fmt = "{category}_{id}_{resolution}.{extension}" archive_fmt = "{id}" - root = "https://wallhaven.cc" + request_interval = 1.4 def __init__(self, match): Extractor.__init__(self, match) @@ -246,8 +247,21 @@ def search(self, params): def _call(self, endpoint, params=None): url = "https://wallhaven.cc/api" + endpoint - return self.extractor.request( - url, headers=self.headers, params=params).json() + + while True: + response = self.extractor.request( + url, params=params, headers=self.headers, fatal=None) + + if response.status_code < 400: + return response.json() + if response.status_code == 429: + self.extractor.wait(seconds=60) + continue + + self.extractor.log.debug("Server response: %s", response.text) + raise exception.StopExtraction( + "API request failed (%s: %s)", + response.status_code, response.reason) def _pagination(self, endpoint, params=None, metadata=None): if params is None: From a673998b1e43822709b01f25482763b54cca81f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 17 Jun 2023 15:27:09 +0200 Subject: [PATCH 158/252] release version 1.25.6 --- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 405c1174a8..429c7eaf92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog +## 1.25.6 - 2023-06-17 +### Additions +- [blogger] download files from `lh*.googleusercontent.com` ([#4070](https://github.com/mikf/gallery-dl/issues/4070)) +- [fantia] extract `plan` metadata ([#2477](https://github.com/mikf/gallery-dl/issues/2477)) +- [fantia] emit warning for non-visible content sections ([#4128](https://github.com/mikf/gallery-dl/issues/4128)) +- [furaffinity] extract `favorite_id` metadata ([#4133](https://github.com/mikf/gallery-dl/issues/4133)) +- [jschan] add generic extractors for jschan image boards ([#3447](https://github.com/mikf/gallery-dl/issues/3447)) +- [kemonoparty] support `.su` TLDs ([#4139](https://github.com/mikf/gallery-dl/issues/4139)) +- [pixiv:novel] add `novel-bookmark` extractor ([#4111](https://github.com/mikf/gallery-dl/issues/4111)) +- [pixiv:novel] add `full-series` option ([#4111](https://github.com/mikf/gallery-dl/issues/4111)) +- [postimage] add gallery support, update image extractor ([#3115](https://github.com/mikf/gallery-dl/issues/3115), [#4134](https://github.com/mikf/gallery-dl/issues/4134)) +- [redgifs] support galleries ([#4021](https://github.com/mikf/gallery-dl/issues/4021)) +- [twitter] extract `conversation_id` metadata ([#3839](https://github.com/mikf/gallery-dl/issues/3839)) +- [vipergirls] add login support ([#4166](https://github.com/mikf/gallery-dl/issues/4166)) +- [vipergirls] use API endpoints ([#4166](https://github.com/mikf/gallery-dl/issues/4166)) +- [formatter] implement `H` conversion ([#4164](https://github.com/mikf/gallery-dl/issues/4164)) +### Fixes +- [acidimg] fix extraction ([#4136](https://github.com/mikf/gallery-dl/issues/4136)) +- [bunkr] update domain to bunkrr.su ([#4159](https://github.com/mikf/gallery-dl/issues/4159), [#4189](https://github.com/mikf/gallery-dl/issues/4189)) +- [bunkr] fix video downloads +- [fanbox] prevent exception due to missing embeds ([#4088](https://github.com/mikf/gallery-dl/issues/4088)) +- [instagram] fix retrieving `/tagged` posts 
([#4122](https://github.com/mikf/gallery-dl/issues/4122))
+- [jpgfish] update domain to `jpg.pet` ([#4138](https://github.com/mikf/gallery-dl/issues/4138))
+- [pixiv:novel] fix error with embeds extraction ([#4175](https://github.com/mikf/gallery-dl/issues/4175))
+- [pornhub] improve redirect handling ([#4188](https://github.com/mikf/gallery-dl/issues/4188))
+- [reddit] fix crash due to empty `crosspost_parent_lists` ([#4120](https://github.com/mikf/gallery-dl/issues/4120), [#4172](https://github.com/mikf/gallery-dl/issues/4172))
+- [redgifs] update `search` URL pattern ([#4115](https://github.com/mikf/gallery-dl/issues/4115), [#4185](https://github.com/mikf/gallery-dl/issues/4185))
+- [senmanga] fix and update ([#4160](https://github.com/mikf/gallery-dl/issues/4160))
+- [twitter] use GraphQL API search endpoint ([#3942](https://github.com/mikf/gallery-dl/issues/3942))
+- [wallhaven] improve HTTP error handling ([#4192](https://github.com/mikf/gallery-dl/issues/4192))
+- [weibo] prevent fatal exception due to missing video data ([#4150](https://github.com/mikf/gallery-dl/issues/4150))
+- [weibo] fix `.json` extension for some videos
+
 ## 1.25.5 - 2023-05-27
 ### Additions
 - [8muses] add `parts` metadata field ([#3329](https://github.com/mikf/gallery-dl/issues/3329))
diff --git a/README.rst b/README.rst
index ba745a85af..44cbfb383a 100644
--- a/README.rst
+++ b/README.rst
@@ -72,9 +72,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.6/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 5d0a9f0cd2..09b8612033 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.25.6-dev"
+__version__ = "1.25.6"

From 92d98697b2faafc7f8fedad14883de395d1b9cec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sun, 18 Jun 2023 20:00:34 +0200
Subject: [PATCH 159/252] [wallhaven] update API error message

---
 gallery_dl/extractor/wallhaven.py | 2 +-
 gallery_dl/version.py             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index a0fba3cca9..0ba0d910d8 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -260,7 +260,7 @@ def _call(self, endpoint, params=None):
 
             self.extractor.log.debug("Server response: %s", response.text)
             raise exception.StopExtraction(
-                "API request failed (%s: %s)",
+                "API request failed (%s %s)",
                 response.status_code, response.reason)
 
 def _pagination(self, endpoint, params=None, metadata=None):
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 09b8612033..e25ba45637 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
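
[Note on the `H` conversion listed in the 1.25.6 changelog above: it chains
`text.remove_html()` and `text.unescape()`. A rough usage sketch, assuming
gallery-dl is importable as laid out in this repository:

    from gallery_dl import formatter

    fmt = formatter.parse("{title!H}")
    # tags are stripped and whitespace collapsed first, then entities unescaped
    print(fmt.format_map({"title": "<p>foo </p> &amp; bar"}))  # -> "foo & bar"
]
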
-__version__ = "1.25.6" +__version__ = "1.25.7-dev" From 2052e7ce59eadaa705a7bdfd803858b433585103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 18 Jun 2023 20:01:33 +0200 Subject: [PATCH 160/252] [hentaifox] fix titles containing '@' (#4201) --- gallery_dl/extractor/hentaifox.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py index ed8576f1e1..a1e681d1d7 100644 --- a/gallery_dl/extractor/hentaifox.py +++ b/gallery_dl/extractor/hentaifox.py @@ -45,6 +45,15 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): "type": "doujinshi", }, }), + # email-protected title (#4201) + ("https://hentaifox.com/gallery/35261/", { + "keyword": { + "gallery_id": 35261, + "title": "ManageM@ster!", + "artist": ["haritama hiroki"], + "group": ["studio n.ball"], + }, + }), ) def __init__(self, match): @@ -65,13 +74,14 @@ def metadata(self, page): return { "gallery_id": text.parse_int(self.gallery_id), - "title" : text.unescape(extr("
<h1>", "</h1>
    ")), "parody" : split(extr(">Parodies:" , "")), "characters": split(extr(">Characters:", "")), "tags" : split(extr(">Tags:" , "")), "artist" : split(extr(">Artists:" , "")), "group" : split(extr(">Groups:" , "")), "type" : text.remove_html(extr(">Category:", " Date: Mon, 19 Jun 2023 15:01:53 +0200 Subject: [PATCH 161/252] [docs] add novel-related keys to 'pixiv.include' (#4111) --- docs/configuration.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index a98ea518f1..f7a1bbcc48 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2411,7 +2411,12 @@ Description when processing a user profile. Possible values are - ``"artworks"``, ``"avatar"``, ``"background"``, ``"favorite"``. + ``"artworks"``, + ``"avatar"``, + ``"background"``, + ``"favorite"``, + ``"novel-user"``, + ``"novel-bookmark"``. It is possible to use ``"all"`` instead of listing all values separately. From 3e7ac5ec3a5dab5b4a6843fe80b3eacc49c7e401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 19 Jun 2023 17:53:40 +0200 Subject: [PATCH 162/252] check file mode bits ensure every file in ./gallery_dl has mode 644 --- .github/workflows/tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a72761d4c2..62691ac6dd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,6 +20,10 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Check file mode bits + run: | + [ "$(find ./gallery_dl -type f -not -perm 644)" ] && exit 1 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: From ec64cbefeb618a8361a63675a3892077f5e70e25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 21 Jun 2023 23:03:05 +0200 Subject: [PATCH 163/252] [postprocessor:exec] add tests --- test/test_postprocessor.py | 70 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index ac89b55cb9..0f9d2ab0d8 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -164,6 +164,76 @@ def test_classify_custom(self): mkdirs.assert_called_once_with(path, exist_ok=True) +class ExecTest(BasePostprocessorTest): + + def test_command_string(self): + self._create({ + "command": "echo {} && rm {};", + }) + + with patch("subprocess.Popen") as p: + i = Mock() + i.wait.return_value = 0 + p.return_value = i + self._trigger(("after",)) + + p.assert_called_once_with( + "echo {0} && rm {0};".format(self.pathfmt.realpath), shell=True) + i.wait.assert_called_once_with() + + def test_command_list(self): + self._create({ + "command": ["~/script.sh", "{category}", + "\fF {_directory.upper()}"], + }) + + with patch("subprocess.Popen") as p: + i = Mock() + i.wait.return_value = 0 + p.return_value = i + self._trigger(("after",)) + + p.assert_called_once_with( + [ + os.path.expanduser("~/script.sh"), + self.pathfmt.kwdict["category"], + self.pathfmt.realdirectory.upper(), + ], + shell=False, + ) + + def test_command_returncode(self): + self._create({ + "command": "echo {}", + }) + + with patch("subprocess.Popen") as p: + i = Mock() + i.wait.return_value = 123 + p.return_value = i + + with self.assertLogs() as log: + self._trigger(("after",)) + + msg = ("WARNING:postprocessor.exec:'echo {}' returned with " + "non-zero exit status (123)".format(self.pathfmt.realpath)) + self.assertEqual(log.output[0], msg) + + def 
test_async(self): + self._create({ + "async" : True, + "command": "echo {}", + }) + + with patch("subprocess.Popen") as p: + i = Mock() + p.return_value = i + self._trigger(("after",)) + + self.assertTrue(p.called) + self.assertFalse(i.wait.called) + + class MetadataTest(BasePostprocessorTest): def test_metadata_default(self): From 068aa26c3eaca1bac57f6dc0352bdc26585876f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 22 Jun 2023 11:54:33 +0200 Subject: [PATCH 164/252] [gelbooru_v01] fix '--range' (#4167) --- gallery_dl/extractor/gelbooru_v01.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py index 9c19664e5e..55fff72ad9 100644 --- a/gallery_dl/extractor/gelbooru_v01.py +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -42,6 +42,10 @@ def _parse_post(self, post_id): return post + def skip(self, num): + self.page_start += num + return num + def _pagination(self, url, begin, end): pid = self.page_start From 8357acf3592f079cdb72d2af3eb2b352f0067a04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 23 Jun 2023 15:17:42 +0200 Subject: [PATCH 165/252] [gelbooru_v01] replace 'extract_all()' with 'extract_from()' It's even slightly faster, especially on Python before 3.11 --- gallery_dl/extractor/gelbooru_v01.py | 37 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py index 55fff72ad9..c4f32a4b0f 100644 --- a/gallery_dl/extractor/gelbooru_v01.py +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,24 +19,23 @@ class GelbooruV01Extractor(booru.BooruExtractor): def _parse_post(self, post_id): url = "{}/index.php?page=post&s=view&id={}".format( self.root, post_id) - page = self.request(url).text - - post = text.extract_all(page, ( - ("created_at", 'Posted: ', ' <'), - ("uploader" , 'By: ', ' <'), - ("width" , 'Size: ', 'x'), - ("height" , '', ' <'), - ("source" , 'Source: ', '<'), - ))[0] - - post["id"] = post_id + extr = text.extract_from(self.request(url).text) + + post = { + "id" : post_id, + "created_at": extr('Posted: ', ' <'), + "uploader" : extr('By: ', ' <'), + "width" : extr('Size: ', 'x'), + "height" : extr('', ' <'), + "source" : extr('Source: ', '<')), + } + post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0] - post["rating"] = (post["rating"] or "?")[0].lower() - post["tags"] = text.unescape(post["tags"]) post["date"] = text.parse_datetime( post["created_at"], "%Y-%m-%d %H:%M:%S") @@ -186,7 +185,7 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor): "md5": "2aaa0438d58fc7baa75a53b4a9621bb89a9d3fdb", "rating": "s", "score": str, - "source": None, + "source": "", "tags": "blush dress green_eyes green_hair hatsune_miku " "long_hair twintails vocaloid", "uploader": "Honochi31", From b6c959744dd430724e2904d991257b5ce498656b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 23 Jun 2023 21:59:51 +0200 Subject: [PATCH 166/252] [furaffinity] improve 'description' HTML (#4224) - ignore header - include footer and closing
</div>

<h2><p>", "</p></h2>"))
         data["artist"] = extr("<strong>", "<")
-        data["_description"] = extr('class="section-body">', '</div>')
+        data["_description"] = extr(
+            'class="submission-description user-submitted-links">',
+            '                                    </div>')
         data["views"] = pi(rh(extr('class="views">', '</span>')))
         data["favorites"] = pi(rh(extr('class="favorites">', '</span>')))
         data["comments"] = pi(rh(extr('class="comments">', '</span>')))
@@ -125,7 +127,9 @@ def _parse_post(self, post_id):
             data["tags"] = text.split_html(extr(
                 'id="keywords">', '</div>'))[::2]
             data["rating"] = extr('<img alt="', ' ')
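
[Note: `extr` in this function is a `text.extract_from(page)` closure, so the
calls above are order-dependent -- each one resumes scanning where the
previous match ended, which is why the markers must be listed in page order.
A toy sketch of that pattern, assuming only gallery-dl's `text` helpers:

    from gallery_dl import text

    extr = text.extract_from("<h2>Title</h2><strong>artist</strong>")
    title  = extr("<h2>", "</h2>")    # -> "Title"
    artist = extr("<strong>", "</")   # -> "artist"
]
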
-            data[", "") + data["_description"] = extr( + '', ' ') data["artist_url"] = data["artist"].replace("_", "").lower() data["user"] = self.user or data["artist_url"] From c1cce4a80bc19cf7de7b4de11186c21a7111b055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 24 Jun 2023 20:49:00 +0200 Subject: [PATCH 167/252] [twitter] extend 'conversations' option (#4211) --- docs/configuration.rst | 10 +++++++--- gallery_dl/extractor/twitter.py | 9 ++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index f7a1bbcc48..8a0b991bfb 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3002,15 +3002,19 @@ Description extractor.twitter.conversations ------------------------------- Type - ``bool`` + * ``bool`` + * ``string`` Default ``false`` Description For input URLs pointing to a single Tweet, e.g. `https://twitter.com/i/web/status/`, fetch media from all Tweets and replies in this `conversation - `__ - or thread. + `__. + + If this option is equal to ``"accessible"``, + only download from conversation Tweets + if the given initial Tweet is accessible. extractor.twitter.csrf diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 710bde336a..10db974405 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -919,7 +919,9 @@ def __init__(self, match): self.tweet_id = match.group(2) def tweets(self): - if self.config("conversations", False): + conversations = self.config("conversations") + if conversations: + self._accessible = (conversations == "accessible") return self._tweets_conversation(self.tweet_id) else: return self._tweets_single(self.tweet_id) @@ -950,6 +952,11 @@ def _tweets_conversation(self, tweet_id): tweet.get("_retweet_id_str") == tweet_id: self._assign_user(tweet["core"]["user_results"]["result"]) break + else: + # initial Tweet not accessible + if self._accessible: + return () + return buffer return itertools.chain(buffer, tweets) From 23469837cd58a5316843ab9efe2c9549d5e58b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 25 Jun 2023 00:48:40 +0200 Subject: [PATCH 168/252] attempt to fix file permission tests --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 62691ac6dd..36ab95340b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,9 +20,9 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Check file mode bits + - name: Check file permissions run: | - [ "$(find ./gallery_dl -type f -not -perm 644)" ] && exit 1 + if [[ "$(find ./gallery_dl -type f -not -perm 644)" ]]; then exit 1; fi - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 From 25c5a6ffcb915f004d6855a50d4d45da4129b8ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 25 Jun 2023 14:01:26 +0200 Subject: [PATCH 169/252] no f-strings --- test/test_postprocessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 0f9d2ab0d8..554a51e82d 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -184,7 +184,7 @@ def test_command_string(self): def test_command_list(self): self._create({ "command": ["~/script.sh", "{category}", - "\fF {_directory.upper()}"], + "\fE _directory.upper()"], }) with patch("subprocess.Popen") as p: From 
ccbc1a1d55135d598f32990dd98e58a14a77023f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 26 Jun 2023 16:49:48 +0200 Subject: [PATCH 170/252] [flickr] add 'metadata' option (#4227) --- docs/configuration.rst | 21 +++++++++++++++++++++ gallery_dl/extractor/flickr.py | 16 +++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 8a0b991bfb..841f51cc58 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1541,6 +1541,27 @@ Description from `linking your Flickr account to gallery-dl `__. +extractor.flickr.metadata +------------------------- +Type + * ``bool`` + * ``string`` + * ``list`` of ``strings`` +Default + ``false`` +Example + * ``license,last_update,machine_tags`` + * ``["license", "last_update", "machine_tags"]`` +Description + Extract additional metadata + (license, date_taken, original_format, last_update, geo, machine_tags, o_dims) + + It is possible to specify a custom list of metadata includes. + See `the extras parameter `__ + in `Flickr API docs `__ + for possible field names. + + extractor.flickr.videos ----------------------- Type diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index e85d68ac6f..d7df3d7f4b 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -451,9 +451,19 @@ def _call(self, method, params): return data def _pagination(self, method, params, key="photos"): - params["extras"] = ("description,date_upload,tags,views,media," - "path_alias,owner_name,") - params["extras"] += ",".join("url_" + fmt[0] for fmt in self.formats) + extras = ("description,date_upload,tags,views,media," + "path_alias,owner_name,") + includes = self.extractor.config("metadata") + if includes: + if isinstance(includes, (list, tuple)): + includes = ",".join(includes) + elif not isinstance(includes, str): + includes = ("license,date_taken,original_format,last_update," + "geo,machine_tags,o_dims") + extras = extras + includes + "," + extras += ",".join("url_" + fmt[0] for fmt in self.formats) + + params["extras"] = extras params["page"] = 1 while True: From 260ff55e194653f48175e2b425f4d88069eb4236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 27 Jun 2023 13:45:51 +0200 Subject: [PATCH 171/252] [senmanga] ensure download URLs have a scheme (#4235) --- gallery_dl/extractor/senmanga.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py index 6d025f45c2..92c9d2cbb5 100644 --- a/gallery_dl/extractor/senmanga.py +++ b/gallery_dl/extractor/senmanga.py @@ -58,6 +58,12 @@ class SenmangaChapterExtractor(ChapterExtractor): "manga": "Akabane Honeko no Bodyguard", }, }), + # no http scheme () + ("https://raw.senmanga.com/amama-cinderella/3", { + "pattern": r"^https://kumacdn.club/image-new-2/a/amama-cinderella" + r"/chapter-3/.+\.jpg", + "count": 30, + }), ) def __init__(self, match): @@ -82,7 +88,7 @@ def metadata(self, page): def images(self, page): return [ - (url, None) + (text.ensure_http_scheme(url), None) for url in text.extract_iter( page, ' Date: Tue, 27 Jun 2023 21:45:44 +0200 Subject: [PATCH 172/252] [poipiku] improve error detection (#4206) --- gallery_dl/extractor/poipiku.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index 14c25c4492..e3bb512b60 100644 --- a/gallery_dl/extractor/poipiku.py +++ 
b/gallery_dl/extractor/poipiku.py
@@ -76,11 +76,12 @@ def items(self):
                 "MD" : "0",
                 "TWF": "-1",
             }
-            page = self.request(
-                url, method="POST", headers=headers, data=data).json()["html"]
+            resp = self.request(
+                url, method="POST", headers=headers, data=data).json()
 
-            if page.startswith(("You need to", "Password is incorrect")):
-                self.log.warning("'%s'", page)
+            page = resp["html"]
+            if (resp.get("result_num") or 0) < 0:
+                self.log.warning("'%s'", page.replace("<br/>
    ", " ")) for thumb in text.extract_iter( page, 'class="IllustItemThumbImg" src="', '"'): From 10786c657e82eebf40c86e82400a8c69fc72fd60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 29 Jun 2023 22:31:34 +0200 Subject: [PATCH 173/252] [mangapark] update and fix 'chapter' extractor (#3969) --- gallery_dl/extractor/mangapark.py | 123 +++++++++++++++++------------- 1 file changed, 71 insertions(+), 52 deletions(-) diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 168fbe8417..9e3f072da5 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -9,15 +9,15 @@ """Extractors for https://mangapark.net/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, util, exception +from .. import text, util import re +BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangapark\.(?:net|com|org|io|me)" + class MangaparkBase(): """Base class for mangapark extractors""" category = "mangapark" - root_fmt = "https://v2.mangapark.{}" - browser = "firefox" @staticmethod def parse_chapter_path(path, data): @@ -50,66 +50,85 @@ def parse_chapter_title(title, data): class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): """Extractor for manga-chapters from mangapark.net""" - pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)" - r"/manga/([^?#]+/i\d+)") + pattern = BASE_PATTERN + r"/title/[^/?#]+/(\d+)" test = ( - ("https://mangapark.net/manga/gosu/i811653/c055/1", { - "count": 50, - "keyword": "db1ed9af4f972756a25dbfa5af69a8f155b043ff", - }), - (("https://mangapark.net/manga" - "/ad-astra-per-aspera-hata-kenjirou/i662051/c001.2/1"), { - "count": 40, - "keyword": "2bb3a8f426383ea13f17ff5582f3070d096d30ac", - }), - (("https://mangapark.net/manga" - "/gekkan-shoujo-nozaki-kun/i2067426/v7/c70/1"), { - "count": 15, - "keyword": "edc14993c4752cee3a76e09b2f024d40d854bfd1", + ("https://mangapark.net/title/114972-aria/6710214-en-ch.60.2", { + "count": 70, + "pattern": r"https://[\w-]+\.mpcdn\.org/comic/2002/e67" + r"/61e29278a583b9227964076e/\d+_\d+_\d+_\d+\.jpeg" + r"\?acc=[^&#]+&exp=\d+", + "keyword": { + "artist": [], + "author": ["Amano Kozue"], + "chapter": 60, + "chapter_id": 6710214, + "chapter_minor": ".2", + "count": 70, + "date": "dt:2022-01-15 09:25:03", + "extension": "jpeg", + "filename": str, + "genre": ["adventure", "comedy", "drama", "sci_fi", + "shounen", "slice_of_life"], + "lang": "en", + "language": "English", + "manga": "Aria", + "manga_id": 114972, + "page": int, + "source": "Koala", + "title": "Special Navigation - Aquaria Ii", + "volume": 12, + }, }), - ("https://mangapark.me/manga/gosu/i811615/c55/1"), - ("https://mangapark.com/manga/gosu/i811615/c55/1"), + ("https://mangapark.com/title/114972-aria/6710214-en-ch.60.2"), + ("https://mangapark.org/title/114972-aria/6710214-en-ch.60.2"), + ("https://mangapark.io/title/114972-aria/6710214-en-ch.60.2"), + ("https://mangapark.me/title/114972-aria/6710214-en-ch.60.2"), ) def __init__(self, match): - tld, self.path = match.groups() - self.root = self.root_fmt.format(tld) - url = "{}/manga/{}?zoom=2".format(self.root, self.path) + self.root = text.root_from_url(match.group(0)) + url = "{}/title/_/{}".format(self.root, match.group(1)) ChapterExtractor.__init__(self, match, url) def metadata(self, page): - data = text.extract_all(page, ( - ("manga_id" , "var _manga_id = '", "'"), - ("chapter_id", "var _book_id = '", "'"), - ("stream" , "var _stream = '", "'"), - ("path" , "var _book_link = '", "'"), - ("manga" , "

    ", "

    "), - ("title" , "
    ", "<"), - ), values={"lang": "en", "language": "English"})[0] - - if not data["path"]: - raise exception.NotFoundError("chapter") - - self.parse_chapter_path(data["path"], data) - if "chapter" not in data: - self.parse_chapter_title(data["title"], data) - - data["manga"], _, data["type"] = data["manga"].rpartition(" ") - data["manga"] = text.unescape(data["manga"]) - data["title"] = data["title"].partition(": ")[2] - for key in ("manga_id", "chapter_id", "stream"): - data[key] = text.parse_int(data[key]) - - return data + data = util.json_loads(text.extr( + page, 'id="__NEXT_DATA__" type="application/json">', '<')) + chapter = (data["props"]["pageProps"]["dehydratedState"] + ["queries"][0]["state"]["data"]["data"]) + manga = chapter["comicNode"]["data"] + source = chapter["sourceNode"]["data"] + + self._urls = chapter["imageSet"]["httpLis"] + self._params = chapter["imageSet"]["wordLis"] + + match = re.match( + r"(?i)" + r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" + r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" + r"(?:\s*:\s*(.*))?", chapter["dname"]) + vol, ch, minor, title = match.groups() if match else (0, 0, "", "") + + return { + "manga" : manga["name"], + "manga_id" : manga["id"], + "artist" : source["artists"], + "author" : source["authors"], + "genre" : source["genres"], + "volume" : text.parse_int(vol), + "chapter" : text.parse_int(ch), + "chapter_minor": minor, + "chapter_id": chapter["id"], + "title" : chapter["title"] or title or "", + "lang" : chapter["lang"], + "language" : util.code_to_language(chapter["lang"]), + "source" : chapter["srcTitle"], + "date" : text.parse_timestamp(chapter["dateCreate"] // 1000), + } def images(self, page): - data = util.json_loads(text.extr(page, "var _load_pages =", ";")) return [ - (text.urljoin(self.root, item["u"]), { - "width": text.parse_int(item["w"]), - "height": text.parse_int(item["h"]), - }) - for item in data + (url + "?" + params, None) + for url, params in zip(self._urls, self._params) ] From 3479646f655574a5501bb5a49e4930335e840cfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 30 Jun 2023 17:17:54 +0200 Subject: [PATCH 174/252] [mangapark] update and fix 'manga' extractor (#3969) TODO: - non-English chapters - 'source' option --- gallery_dl/extractor/mangapark.py | 351 +++++++++++++++++++++++------- setup.cfg | 1 + 2 files changed, 276 insertions(+), 76 deletions(-) diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 9e3f072da5..f16d7e430c 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -8,7 +8,7 @@ """Extractors for https://mangapark.net/""" -from .common import ChapterExtractor, MangaExtractor +from .common import ChapterExtractor, Extractor, Message from .. 
import text, util import re @@ -18,34 +18,18 @@ class MangaparkBase(): """Base class for mangapark extractors""" category = "mangapark" + _match_title = None - @staticmethod - def parse_chapter_path(path, data): - """Get volume/chapter information from url-path of a chapter""" - data["volume"], data["chapter_minor"] = 0, "" - for part in path.split("/")[1:]: - key, value = part[0], part[1:] - if key == "c": - chapter, dot, minor = value.partition(".") - data["chapter"] = text.parse_int(chapter) - data["chapter_minor"] = dot + minor - elif key == "i": - data["chapter_id"] = text.parse_int(value) - elif key == "v": - data["volume"] = text.parse_int(value) - elif key == "s": - data["stream"] = text.parse_int(value) - elif key == "e": - data["chapter_minor"] = "v" + value - - @staticmethod - def parse_chapter_title(title, data): - match = re.search(r"(?i)(?:vol(?:ume)?[ .]*(\d+) )?" - r"ch(?:apter)?[ .]*(\d+)(\.\w+)?", title) - if match: - vol, ch, data["chapter_minor"] = match.groups() - data["volume"] = text.parse_int(vol) - data["chapter"] = text.parse_int(ch) + def _parse_chapter_title(self, title): + if not self._match_title: + MangaparkBase._match_title = re.compile( + r"(?i)" + r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" + r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" + r"(?:\s*:\s*(.*))?" + ).match + match = self._match_title(title) + return match.groups() if match else (0, 0, "", "") class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): @@ -100,13 +84,7 @@ def metadata(self, page): self._urls = chapter["imageSet"]["httpLis"] self._params = chapter["imageSet"]["wordLis"] - - match = re.match( - r"(?i)" - r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" - r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" - r"(?:\s*:\s*(.*))?", chapter["dname"]) - vol, ch, minor, title = match.groups() if match else (0, 0, "", "") + vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) return { "manga" : manga["name"], @@ -132,50 +110,271 @@ def images(self, page): ] -class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): +class MangaparkMangaExtractor(MangaparkBase, Extractor): """Extractor for manga from mangapark.net""" - chapterclass = MangaparkChapterExtractor - pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)" - r"(/manga/[^/?#]+)/?$") + subcategory = "manga" + pattern = BASE_PATTERN + r"/title/(\d+)(?:-[^/?#]*)?/?$" test = ( - ("https://mangapark.net/manga/aria", { - "url": "51c6d82aed5c3c78e0d3f980b09a998e6a2a83ee", - "keyword": "cabc60cf2efa82749d27ac92c495945961e4b73c", + ("https://mangapark.net/title/114972-aria", { + "count": 141, + "pattern": MangaparkChapterExtractor.pattern, + "keyword": { + "chapter": int, + "chapter_id": int, + "chapter_minor": str, + "date": "type:datetime", + "lang": "en", + "language": "English", + "manga_id": 114972, + "source": "re:Horse|Koala", + "title": str, + "volume": int, + }, }), - ("https://mangapark.me/manga/aria"), - ("https://mangapark.com/manga/aria"), + ("https://mangapark.com/title/114972-"), + ("https://mangapark.com/title/114972"), + ("https://mangapark.com/title/114972-aria"), + ("https://mangapark.org/title/114972-aria"), + ("https://mangapark.io/title/114972-aria"), + ("https://mangapark.me/title/114972-aria"), ) def __init__(self, match): - self.root = self.root_fmt.format(match.group(1)) - MangaExtractor.__init__(self, match, self.root + match.group(2)) - - def chapters(self, page): - results = [] - data = {"lang": "en", "language": "English"} - data["manga"] = text.unescape( - text.extr(page, '', ' Manga - ')) - - for stream in 
page.split('<div id="stream_')[1:]: - data["stream"] = text.parse_int(text.extr(stream, '', '"')) - - for chapter in text.extract_iter(stream, '<li ', '</li>'): - path , pos = text.extract(chapter, 'href="', '"') - title1, pos = text.extract(chapter, '>', '<', pos) - title2, pos = text.extract(chapter, '>: </span>', '<', pos) - count , pos = text.extract(chapter, ' of ', ' ', pos) - - self.parse_chapter_path(path[8:], data) - if "chapter" not in data: - self.parse_chapter_title(title1, data) - - if title2: - data["title"] = title2.strip() - else: - data["title"] = title1.partition(":")[2].strip() - - data["count"] = text.parse_int(count) - results.append((self.root + path, data.copy())) - data.pop("chapter", None) - - return results + self.root = text.root_from_url(match.group(0)) + self.manga_id = int(match.group(1)) + Extractor.__init__(self, match) + + def items(self): + for chapter in self.chapters(): + chapter = chapter["data"] + url = self.root + chapter["urlPath"] + + vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) + data = { + "manga_id" : self.manga_id, + "volume" : text.parse_int(vol), + "chapter" : text.parse_int(ch), + "chapter_minor": minor, + "chapter_id": chapter["id"], + "title" : chapter["title"] or title or "", + "lang" : chapter["lang"], + "language" : util.code_to_language(chapter["lang"]), + "source" : chapter["srcTitle"], + "date" : text.parse_timestamp( + chapter["dateCreate"] // 1000), + "_extractor": MangaparkChapterExtractor, + } + yield Message.Queue, url, data + + def chapters(self): + source = self.config("source") + if source: + return self.chapters_source(source) + return self.chapters_all() + + def chapters_all(self): + pnum = 0 + variables = { + "select": { + "comicId": self.manga_id, + "range" : None, + "isAsc" : not self.config("chapter-reverse"), + } + } + + while True: + data = self._request_graphql( + "get_content_comicChapterRangeList", variables) + + for item in data["items"]: + yield from item["chapterNodes"] + + if not pnum: + pager = data["pager"] + pnum += 1 + + try: + variables["select"]["range"] = pager[pnum] + except IndexError: + return + + def chapters_source(self, source_id): + variables = { + "sourceId": source_id, + } + + yield from self._request_graphql( + "get_content_source_chapterList", variables) + + def _request_graphql(self, opname, variables): + url = self.root + "/apo/" + data = { + "query" : QUERIES[opname], + "variables" : util.json_dumps(variables), + "operationName": opname, + } + return self.request( + url, method="POST", json=data).json()["data"][opname] + + +QUERIES = { + "get_content_comicChapterRangeList": """ + query get_content_comicChapterRangeList($select: Content_ComicChapterRangeList_Select) { + get_content_comicChapterRangeList( + select: $select + ) { + reqRange{x y} + missing + pager {x y} + items{ + serial + chapterNodes { + + id + data { + + + id + sourceId + + dbStatus + isNormal + isHidden + isDeleted + isFinal + + dateCreate + datePublic + dateModify + lang + volume + serial + dname + title + urlPath + + srcTitle srcColor + + count_images + + stat_count_post_child + stat_count_post_reply + stat_count_views_login + stat_count_views_guest + + userId + userNode { + + id + data { + +id +name +uniq +avatarUrl +urlPath + +verified +deleted +banned + +dateCreate +dateOnline + +stat_count_chapters_normal +stat_count_chapters_others + +is_adm is_mod is_vip is_upr + + } + + } + + disqusId + + + } + + sser_read + } + } + + } + } +""", + + "get_content_source_chapterList": """ + query 
get_content_source_chapterList($sourceId: Int!) { + get_content_source_chapterList( + sourceId: $sourceId + ) { + + id + data { + + + id + sourceId + + dbStatus + isNormal + isHidden + isDeleted + isFinal + + dateCreate + datePublic + dateModify + lang + volume + serial + dname + title + urlPath + + srcTitle srcColor + + count_images + + stat_count_post_child + stat_count_post_reply + stat_count_views_login + stat_count_views_guest + + userId + userNode { + + id + data { + +id +name +uniq +avatarUrl +urlPath + +verified +deleted +banned + +dateCreate +dateOnline + +stat_count_chapters_normal +stat_count_chapters_others + +is_adm is_mod is_vip is_upr + + } + + } + + disqusId + + + } + + } + } +""", +} diff --git a/setup.cfg b/setup.cfg index 521edc5d4a..56d7108721 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,4 @@ ignore = E203,E226,W504 per-file-ignores = setup.py: E501 gallery_dl/extractor/500px.py: E501 + gallery_dl/extractor/mangapark.py: E501 From 46cae04aa3a113c7b6bbee1bb468669564b14ae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 30 Jun 2023 17:19:53 +0200 Subject: [PATCH 175/252] [piczel] update API server (#4244) --- gallery_dl/extractor/piczel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py index b03d6f842b..56c2978259 100644 --- a/gallery_dl/extractor/piczel.py +++ b/gallery_dl/extractor/piczel.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,7 +19,7 @@ class PiczelExtractor(Extractor): filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" root = "https://piczel.tv" - api_root = "https://tombstone.piczel.tv" + api_root = root def items(self): for post in self.posts(): From 3845c0256de2216c42b4843780a3afa7a75870cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 1 Jul 2023 19:11:41 +0200 Subject: [PATCH 176/252] [sankaku] improve warnings for unavailable posts --- gallery_dl/extractor/sankaku.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index f36051bb46..09e5421d88 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -46,10 +46,15 @@ def skip(self, num): def _file_url(self, post): url = post["file_url"] - if not url and self._warning: - self.log.warning( - "Login required to download 'contentious_content' posts") - SankakuExtractor._warning = False + if not url: + if post["status"] != "active": + self.log.warning( + "Unable to download post %s (%s)", + post["id"], post["status"]) + elif self._warning: + self.log.warning( + "Login required to download 'contentious_content' posts") + SankakuExtractor._warning = False elif url[8] == "v": url = "https://s.sankakucomplex.com" + url[url.index("/", 8):] return url From c45a913bfd386754922549f63f0cae548199ab62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 1 Jul 2023 19:19:39 +0200 Subject: [PATCH 177/252] [flickr] add 'exif' option --- docs/configuration.rst | 12 ++++++++++++ docs/gallery-dl.conf | 6 ++++-- gallery_dl/extractor/flickr.py | 11 +++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff 
--git a/docs/configuration.rst b/docs/configuration.rst index 841f51cc58..d73cffef7a 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1541,6 +1541,18 @@ Description from `linking your Flickr account to gallery-dl <OAuth_>`__. +extractor.flickr.exif +--------------------- +Type + ``bool`` +Default + ``false`` +Description + Fetch `exif` and `camera` metadata for each photo. + + Note: This requires 1 additional API call per photo. + + extractor.flickr.metadata ------------------------- Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 6a3c84f415..902d0a2f7a 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -108,8 +108,10 @@ }, "flickr": { - "videos": true, - "size-max": null + "exif": false, + "metadata": false, + "size-max": null, + "videos": true }, "furaffinity": { diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index d7df3d7f4b..d44ff3c842 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -106,6 +106,8 @@ def __init__(self, match): def items(self): photo = self.api.photos_getInfo(self.item_id) + if self.api.exif: + photo.update(self.api.photos_getExif(self.item_id)) if photo["media"] == "video" and self.api.videos: self.api._extract_video(photo) @@ -323,6 +325,7 @@ class FlickrAPI(oauth.OAuth1API): def __init__(self, extractor): oauth.OAuth1API.__init__(self, extractor) + self.exif = extractor.config("exif", False) self.videos = extractor.config("videos", True) self.maxsize = extractor.config("size-max") if isinstance(self.maxsize, str): @@ -367,6 +370,11 @@ def people_getPhotos(self, user_id): params = {"user_id": user_id} return self._pagination("people.getPhotos", params) + def photos_getExif(self, photo_id): + """Retrieves a list of EXIF/TIFF/GPS tags for a given photo.""" + params = {"photo_id": photo_id} + return self._call("photos.getExif", params)["photo"] + def photos_getInfo(self, photo_id): """Get information about a photo.""" params = {"photo_id": photo_id} @@ -488,6 +496,9 @@ def _extract_format(self, photo): photo["views"] = text.parse_int(photo["views"]) photo["date"] = text.parse_timestamp(photo["dateupload"]) photo["tags"] = photo["tags"].split() + + if self.exif: + photo.update(self.photos_getExif(photo["id"])) photo["id"] = text.parse_int(photo["id"]) if "owner" in photo: From 6ae3101fd0025ddd33fc9bfe5f09bc4dd593e2e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 2 Jul 2023 15:07:22 +0200 Subject: [PATCH 178/252] [mangapark] add 'source' option (#3969) --- docs/configuration.rst | 18 ++++++ gallery_dl/extractor/mangapark.py | 104 ++++++++++++++++++++++++++++-- 2 files changed, 115 insertions(+), 7 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index d73cffef7a..8c690dd75c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2110,6 +2110,24 @@ Description List of acceptable content ratings for returned chapters. +extractor.mangapark.source +-------------------------- +Type + * ``string`` + * ``integer`` +Example + * ``"koala:en"`` + * ``15150116`` +Description + Select chapter source and language for a manga. + + | The general syntax is ``"<source name>:<ISO 639-1 language code>"``. + | Both are optional, meaning ``"koala"``, ``"koala:"``, ``":en"``, + or even just ``":"`` are possible as well. + + Specifying the numeric ``ID`` of a source is also supported. 
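For illustration, a minimal sketch of setting this option from Python, using the same ``config.set`` helper the project's own test suite calls; the source name and ID are the Example values from above and stand in for whatever source is actually wanted:

    from gallery_dl import config

    # prefer the "Koala" source with English-language chapters
    config.set(("extractor", "mangapark"), "source", "koala:en")

    # or pin an exact source by its numeric ID
    config.set(("extractor", "mangapark"), "source", 15150116)

Name matching is case-insensitive, and leaving either half of the ``name:lang`` pair empty skips that check, as implemented by ``_select_source`` below.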
+ + extractor.[mastodon].access-token --------------------------------- Type diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index f16d7e430c..e6c3b9850e 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -9,7 +9,7 @@ """Extractors for https://mangapark.net/""" from .common import ChapterExtractor, Extractor, Message -from .. import text, util +from .. import text, util, exception import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangapark\.(?:net|com|org|io|me)" @@ -99,7 +99,8 @@ def metadata(self, page): "title" : chapter["title"] or title or "", "lang" : chapter["lang"], "language" : util.code_to_language(chapter["lang"]), - "source" : chapter["srcTitle"], + "source" : source["srcTitle"], + "source_id" : source["id"], "date" : text.parse_timestamp(chapter["dateCreate"] // 1000), } @@ -127,10 +128,21 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): "language": "English", "manga_id": 114972, "source": "re:Horse|Koala", + "source_id": int, "title": str, "volume": int, }, }), + # 'source' option + ("https://mangapark.net/title/114972-aria", { + "options": (("source", "koala"),), + "count": 70, + "pattern": MangaparkChapterExtractor.pattern, + "keyword": { + "source": "Koala", + "source_id": 15150116, + }, + }), ("https://mangapark.com/title/114972-"), ("https://mangapark.com/title/114972"), ("https://mangapark.com/title/114972-aria"), @@ -168,9 +180,12 @@ def items(self): def chapters(self): source = self.config("source") - if source: - return self.chapters_source(source) - return self.chapters_all() + if not source: + return self.chapters_all() + + source_id = self._select_source(source) + self.log.debug("Requesting chapters for source_id %s", source_id) + return self.chapters_source(source_id) def chapters_all(self): pnum = 0 @@ -202,10 +217,35 @@ def chapters_source(self, source_id): variables = { "sourceId": source_id, } - - yield from self._request_graphql( + chapters = self._request_graphql( "get_content_source_chapterList", variables) + if self.config("chapter-reverse"): + chapters.reverse() + return chapters + + def _select_source(self, source): + if isinstance(source, int): + return source + + group, _, lang = source.partition(":") + group = group.lower() + + variables = { + "comicId" : self.manga_id, + "dbStatuss" : ["normal"], + "haveChapter": True, + } + for item in self._request_graphql( + "get_content_comic_sources", variables): + data = item["data"] + if (not group or data["srcTitle"].lower() == group) and ( + not lang or data["lang"] == lang): + return data["id"] + + raise exception.StopExtraction( + "'%s' does not match any available source", source) + def _request_graphql(self, opname, variables): url = self.root + "/apo/" data = { @@ -377,4 +417,54 @@ def _request_graphql(self, opname, variables): } } """, + + "get_content_comic_sources": """ + query get_content_comic_sources($comicId: Int!, $dbStatuss: [String] = [], $userId: Int, $haveChapter: Boolean, $sortFor: String) { + get_content_comic_sources( + comicId: $comicId + dbStatuss: $dbStatuss + userId: $userId + haveChapter: $haveChapter + sortFor: $sortFor + ) { + +id +data{ + + id + + dbStatus + isNormal + isHidden + isDeleted + + lang name altNames authors artists + + release + genres summary{code} extraInfo{code} + + urlCover600 + urlCover300 + urlCoverOri + + srcTitle srcColor + + chapterCount + chapterNode_last { + id + data { + dateCreate datePublic dateModify + volume serial + dname title + urlPath + userNode { + id data {uniq 
name} + } + } + } +} + + } + } +""", } From 3d8de383bf1bbcc0c6a4f835e847fdd78083b2ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 2 Jul 2023 15:17:10 +0200 Subject: [PATCH 179/252] [mangapark] extract 'source_id' for manga forgot to add this to 6ae3101f --- gallery_dl/extractor/mangapark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index e6c3b9850e..a0d1e80b91 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -172,6 +172,7 @@ def items(self): "lang" : chapter["lang"], "language" : util.code_to_language(chapter["lang"]), "source" : chapter["srcTitle"], + "source_id" : chapter["sourceId"], "date" : text.parse_timestamp( chapter["dateCreate"] // 1000), "_extractor": MangaparkChapterExtractor, From 5457007dd3597788c066b4faea2cc5ee166e8740 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 2 Jul 2023 22:25:45 +0200 Subject: [PATCH 180/252] release version 1.25.7 --- CHANGELOG.md | 16 ++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 429c7eaf92..b71b404378 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## 1.25.7 - 2023-07-02 +### Additions +- [flickr] add 'exif' option +- [flickr] add 'metadata' option ([#4227](https://github.com/mikf/gallery-dl/issues/4227)) +- [mangapark] add 'source' option ([#3969](https://github.com/mikf/gallery-dl/issues/3969)) +- [twitter] extend 'conversations' option ([#4211](https://github.com/mikf/gallery-dl/issues/4211)) +### Fixes +- [furaffinity] improve 'description' HTML ([#4224](https://github.com/mikf/gallery-dl/issues/4224)) +- [gelbooru_v01] fix '--range' ([#4167](https://github.com/mikf/gallery-dl/issues/4167)) +- [hentaifox] fix titles containing '@' ([#4201](https://github.com/mikf/gallery-dl/issues/4201)) +- [mangapark] update to v5 ([#3969](https://github.com/mikf/gallery-dl/issues/3969)) +- [piczel] update API server address ([#4244](https://github.com/mikf/gallery-dl/issues/4244)) +- [poipiku] improve error detection ([#4206](https://github.com/mikf/gallery-dl/issues/4206)) +- [sankaku] improve warnings for unavailable posts +- [senmanga] ensure download URLs have a scheme ([#4235](https://github.com/mikf/gallery-dl/issues/4235)) + ## 1.25.6 - 2023-06-17 ### Additions - [blogger] download files from `lh*.googleusercontent.com` ([#4070](https://github.com/mikf/gallery-dl/issues/4070)) diff --git a/README.rst b/README.rst index 44cbfb383a..86dd58deae 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.6/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.6/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index e25ba45637..9438d736ff 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under 
the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.25.7-dev" +__version__ = "1.25.7" From 9576652fa5ded8a85dd9bbbf69585b9ff4fa924c Mon Sep 17 00:00:00 2001 From: FrostTheFox <thunderfox@thunderfox.nl> Date: Tue, 4 Jul 2023 02:35:48 -0400 Subject: [PATCH 181/252] extract & pass auth token for newgrounds --- gallery_dl/extractor/newgrounds.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 5d100a49f4..30bde549ec 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -90,11 +90,13 @@ def _login_impl(self, username, password): headers = {"Origin": self.root, "Referer": url} url = text.urljoin(self.root, text.extr( response.text, 'action="', '"')) + auth_token = text.extr(response.text, 'name="auth" value="', '"') data = { "username": username, "password": password, "remember": "1", "login" : "1", + "auth" : auth_token } response = self.request(url, method="POST", headers=headers, data=data) From a78f8ce5b0e907c3de654a539b5921f00b2c54c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 4 Jul 2023 17:36:41 +0200 Subject: [PATCH 182/252] [paheal] fix extraction (#4262) swap ' and " --- gallery_dl/extractor/paheal.py | 27 ++++++++++++++------------- gallery_dl/version.py | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index f0a50c8416..d928ebd7de 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -96,8 +96,9 @@ def get_posts(self): url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) page = self.request(url).text + pos = page.find("id='image-list'") for post in text.extract_iter( - page, '<img id="thumb_', 'Only</a>'): + page, "<img id='thumb_", "Only</a>", pos): yield self._extract_data(post) if ">Next<" not in page: @@ -106,10 +107,10 @@ def get_posts(self): @staticmethod def _extract_data(post): - pid , pos = text.extract(post, '', '"') - data, pos = text.extract(post, 'title="', '"', pos) - md5 , pos = text.extract(post, '/_thumbs/', '/', pos) - url , pos = text.extract(post, '<a href="', '"', pos) + pid , pos = text.extract(post, "", "'") + data, pos = text.extract(post, "title='", "'", pos) + md5 , pos = text.extract(post, "/_thumbs/", "/", pos) + url , pos = text.extract(post, "<a href='", "'", pos) tags, data, date = data.split("\n") dimensions, size, ext = data.split(" // ") @@ -139,19 +140,19 @@ class PahealPostExtractor(PahealExtractor): ("https://rule34.paheal.net/post/view/481609", { "pattern": r"https://tulip\.paheal\.net/_images" r"/bbdc1c33410c2cdce7556c7990be26b7/481609%20-%20" - r"Azumanga_Daioh%20Osaka%20Vuvuzela%20inanimate\.jpg", + r"Azumanga_Daioh%20inanimate%20Osaka%20Vuvuzela\.jpg", "content": "7b924bcf150b352ac75c9d281d061e174c851a11", "keyword": { "date": "dt:2010-06-17 15:40:23", "extension": "jpg", "file_url": "re:https://tulip.paheal.net/_images/bbdc1c33410c", - "filename": "481609 - Azumanga_Daioh Osaka Vuvuzela inanimate", + "filename": "481609 - Azumanga_Daioh inanimate Osaka Vuvuzela", "height": 660, "id": 481609, "md5": "bbdc1c33410c2cdce7556c7990be26b7", "size": 157389, "source": None, - "tags": "Azumanga_Daioh Osaka Vuvuzela inanimate", + "tags": "Azumanga_Daioh inanimate Osaka Vuvuzela", "uploader": "CaptainButtface", "width": 614, }, @@ -163,7 +164,7 @@ class PahealPostExtractor(PahealExtractor): 
"md5": "b39edfe455a0381110c710d6ed2ef57d", "size": 758989, "source": "http://www.furaffinity.net/view/4057821/", - "tags": "Vuvuzela inanimate thelost-dragon", + "tags": "inanimate thelost-dragon Vuvuzela", "uploader": "leacheate_soup", "width": 1200, }, @@ -171,8 +172,8 @@ class PahealPostExtractor(PahealExtractor): # video ("https://rule34.paheal.net/post/view/3864982", { "pattern": r"https://[\w]+\.paheal\.net/_images/7629fc0ff77e32637d" - r"de5bf4f992b2cb/3864982%20-%20Metal_Gear%20Metal_Gear_" - r"Solid_V%20Quiet%20Vg_erotica%20animated%20webm\.webm", + r"de5bf4f992b2cb/3864982%20-%20animated%20Metal_Gear%20" + r"Metal_Gear_Solid_V%20Quiet%20Vg_erotica%20webm\.webm", "keyword": { "date": "dt:2020-09-06 01:59:03", "duration": 30.0, @@ -183,8 +184,8 @@ class PahealPostExtractor(PahealExtractor): "size": 18454938, "source": "https://twitter.com/VG_Worklog" "/status/1302407696294055936", - "tags": "Metal_Gear Metal_Gear_Solid_V Quiet " - "Vg_erotica animated webm", + "tags": "animated Metal_Gear Metal_Gear_Solid_V " + "Quiet Vg_erotica webm", "uploader": "justausername", "width": 1768, }, diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 9438d736ff..39cfbd1c5d 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.25.7" +__version__ = "1.26.0-dev" From 1d4db83d4939377ea195c9756aadfee1cbe7729b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 4 Jul 2023 17:41:22 +0200 Subject: [PATCH 183/252] [weibo] fix end of cursor based pagination --- gallery_dl/extractor/weibo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 805aa536c4..5a3adc80b4 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -170,6 +170,8 @@ def _pagination(self, endpoint, params): yield from statuses if "next_cursor" in data: # videos, newvideo + if data["next_cursor"] == -1: + return params["cursor"] = data["next_cursor"] elif "page" in params: # home, article params["page"] += 1 From f86fdf64a64091e10176bfcd87fd07db635e7b93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 4 Jul 2023 17:55:22 +0200 Subject: [PATCH 184/252] [twitter] use GraphQL search by default (#4264) --- docs/configuration.rst | 7 +++---- gallery_dl/extractor/twitter.py | 10 +++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 8c690dd75c..0a2085e97b 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3221,13 +3221,12 @@ extractor.twitter.search-endpoint Type ``string`` Default - ``"auto"`` + ``"graphql"`` Description Selects the API endpoint used to retrieve search results. 
- * ``"rest"``: Legacy REST endpoint - returns a ``403 Forbidden`` error when not logged in - * ``"graphql"``: New GraphQL endpoint - * ``"auto"``: ``"rest"`` when logged in, ``"graphql"`` otherwise + * ``"graphql"``: GraphQL endpoint + * ``"rest"``: Legacy REST endpoint extractor.twitter.timeline.strategy diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 10db974405..b3064cc9cd 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -510,13 +510,13 @@ def tweets(self): if not self.textonly: # try to search for media-only tweets tweet = None - for tweet in self.api.search_adaptive(query + " filter:links"): + for tweet in self.api.search_timeline(query + " filter:links"): yield tweet if tweet is not None: return # yield unfiltered search results - yield from self.api.search_adaptive(query) + yield from self.api.search_timeline(query) def _select_tweet_source(self): strategy = self.config("strategy") @@ -693,7 +693,7 @@ def tweets(self): except KeyError: pass - return self.api.search_adaptive(query) + return self.api.search_timeline(query) class TwitterHashtagExtractor(TwitterExtractor): @@ -1087,8 +1087,8 @@ def __init__(self, extractor): auth_token = cookies.get("auth_token", domain=cookiedomain) search = extractor.config("search-endpoint") - if search == "graphql" or not auth_token and search in ("auto", None): - self.search_adaptive = self.search_timeline + if search == "rest": + self.search_timeline = self.search_adaptive self.headers = { "Accept": "*/*", From 1bf9f52c997fa919eeb94d8a28c70afc2e42a18a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 4 Jul 2023 18:17:32 +0200 Subject: [PATCH 185/252] [twitter] add 'ratelimit' option (#4251) --- docs/configuration.rst | 13 +++++++++++++ gallery_dl/extractor/twitter.py | 3 +++ 2 files changed, 16 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 0a2085e97b..2608d2ab86 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3183,6 +3183,19 @@ Description a quoted (original) Tweet when it sees the Tweet which quotes it. +extractor.twitter.ratelimit +--------------------------- +Type + ``string`` +Default + ``"wait"`` +Description + Selects how to handle exceeding the API rate limit. 
+ + * ``"abort"``: Raise an error and stop extraction + * ``"wait"``: Wait until rate limit reset + + extractor.twitter.replies ------------------------- Type diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index b3064cc9cd..c0b4ab615f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1439,6 +1439,9 @@ def _call(self, endpoint, params, method="GET", auth=True, root=None): if response.status_code == 429: # rate limit exceeded + if self.extractor.config("ratelimit") == "abort": + raise exception.StopExtraction("Rate limit exceeded") + until = response.headers.get("x-rate-limit-reset") seconds = None if until else 60 self.extractor.wait(until=until, seconds=seconds) From a16d7c59cbb0a323a6a21b53e8f7d8940c4c0c27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 4 Jul 2023 21:49:57 +0200 Subject: [PATCH 186/252] [newgrounds] access 'response.text' only once --- gallery_dl/extractor/newgrounds.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 30bde549ec..e047f3df2a 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -87,16 +87,15 @@ def _login_impl(self, username, password): if response.history and response.url.endswith("/social"): return self.session.cookies + page = response.text headers = {"Origin": self.root, "Referer": url} - url = text.urljoin(self.root, text.extr( - response.text, 'action="', '"')) - auth_token = text.extr(response.text, 'name="auth" value="', '"') + url = text.urljoin(self.root, text.extr(page, 'action="', '"')) data = { "username": username, "password": password, "remember": "1", "login" : "1", - "auth" : auth_token + "auth" : text.extr(page, 'name="auth" value="', '"'), } response = self.request(url, method="POST", headers=headers, data=data) From 5b59a0d143ff9da76df1e5a25a58409a6434481e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 5 Jul 2023 15:12:50 +0200 Subject: [PATCH 187/252] update default User-Agent header to Firefox 115 ESR --- docs/configuration.rst | 2 +- docs/gallery-dl.conf | 2 +- gallery_dl/extractor/common.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 2608d2ab86..193f35cfba 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -525,7 +525,7 @@ extractor.*.user-agent Type ``string`` Default - ``"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"`` + ``"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0"`` Description User-Agent header value to be used for HTTP requests. 
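As a usage sketch for this option (the header string below is only an example value, not a recommendation):

    from gallery_dl import config

    # override the default header for all extractors
    config.set(("extractor",), "user-agent",
               "Mozilla/5.0 (X11; Linux x86_64; rv:115.0) "
               "Gecko/20100101 Firefox/115.0")

    # or mimic a locally installed browser, which is resolved
    # through _browser_useragent() as handled in common.py below
    config.set(("extractor",), "user-agent", "browser")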
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 902d0a2f7a..4a08eb8cb8 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -10,7 +10,7 @@ "proxy": null, "skip": true, - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0", "retries": 4, "timeout": 30.0, "verify": true, diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 50d1026c70..5c9b157787 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -286,7 +286,7 @@ def _init_session(self): useragent = self.config("user-agent") if useragent is None: useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:102.0) Gecko/20100101 Firefox/102.0") + "rv:115.0) Gecko/20100101 Firefox/115.0") elif useragent == "browser": useragent = _browser_useragent() headers["User-Agent"] = useragent @@ -805,8 +805,8 @@ def _browser_useragent(): HTTP_HEADERS = { "firefox": ( - ("User-Agent", "Mozilla/5.0 ({}; rv:102.0) " - "Gecko/20100101 Firefox/102.0"), + ("User-Agent", "Mozilla/5.0 ({}; rv:115.0) " + "Gecko/20100101 Firefox/115.0"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," "image/avif,image/webp,*/*;q=0.8"), ("Accept-Language", "en-US,en;q=0.5"), From e9b9f751bfe9f743880fdadc7076d783b70f3266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 6 Jul 2023 16:03:35 +0200 Subject: [PATCH 188/252] [gfycat] support '@me' user (#3770, #4271) --- gallery_dl/extractor/gfycat.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index 0ccd7fa55e..2d057f4866 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2022 Mike Fährmann +# Copyright 2017-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -80,6 +80,8 @@ class GfycatUserExtractor(GfycatExtractor): }) def gfycats(self): + if self.key == "me": + return GfycatAPI(self).me() return GfycatAPI(self).user(self.key) @@ -220,15 +222,6 @@ class GfycatAPI(): def __init__(self, extractor): self.extractor = extractor - def gfycat(self, gfycat_id): - endpoint = "/v1/gfycats/" + gfycat_id - return self._call(endpoint)["gfyItem"] - - def user(self, user): - endpoint = "/v1/users/{}/gfycats".format(user.lower()) - params = {"count": 100} - return self._pagination(endpoint, params) - def collection(self, user, collection): endpoint = "/v1/users/{}/collections/{}/gfycats".format( user, collection) @@ -240,11 +233,25 @@ def collections(self, user): params = {"count": 100} return self._pagination(endpoint, params, "gfyCollections") + def gfycat(self, gfycat_id): + endpoint = "/v1/gfycats/" + gfycat_id + return self._call(endpoint)["gfyItem"] + + def me(self): + endpoint = "/v1/me/gfycats" + params = {"count": 100} + return self._pagination(endpoint, params) + def search(self, query): endpoint = "/v1/gfycats/search" params = {"search_text": query, "count": 150} return self._pagination(endpoint, params) + def user(self, user): + endpoint = "/v1/users/{}/gfycats".format(user.lower()) + params = {"count": 100} + return self._pagination(endpoint, params) + def _call(self, endpoint, params=None): url = self.API_ROOT + endpoint return 
self.extractor.request(url, params=params).json() From 7444fc125bacef8ce4db2e7e99aa80d3ee562f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 6 Jul 2023 18:56:34 +0200 Subject: [PATCH 189/252] [gfycat] implement login support (#3770, #4271) For the record: '/webtoken' and '/weblogin' are not the same ... --- docs/configuration.rst | 1 + docs/supportedsites.md | 2 +- gallery_dl/extractor/gfycat.py | 41 +++++++++++++++++++++++++++++++++- scripts/supportedsites.py | 1 + test/test_results.py | 2 +- 5 files changed, 44 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 193f35cfba..75033307c4 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -382,6 +382,7 @@ Description * ``e621`` (*) * ``e926`` (*) * ``exhentai`` + * ``gfycat`` * ``idolcomplex`` * ``imgbb`` * ``inkbunny`` diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 444e4db59e..a3c0ee10bd 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -251,7 +251,7 @@ Consider all sites to be NSFW unless otherwise known. <td>Gfycat</td> <td>https://gfycat.com/</td> <td>Collections, individual Images, Search Results, User Profiles</td> - <td></td> + <td>Supported</td> </tr> <tr> <td>Gofile</td> diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index 2d057f4866..ccebdf9886 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text, exception +from ..cache import cache class GfycatExtractor(Extractor): @@ -221,6 +222,8 @@ class GfycatAPI(): def __init__(self, extractor): self.extractor = extractor + self.headers = {} + self.username, self.password = extractor._get_auth_info() def collection(self, user, collection): endpoint = "/v1/users/{}/collections/{}/gfycats".format( @@ -252,9 +255,45 @@ def user(self, user): params = {"count": 100} return self._pagination(endpoint, params) + def authenticate(self): + self.headers["Authorization"] = \ + self._authenticate_impl(self.username, self.password) + + @cache(maxage=3600, keyarg=1) + def _authenticate_impl(self, username, password): + self.extractor.log.info("Logging in as %s", username) + + url = "https://weblogin.gfycat.com/oauth/webtoken" + headers = {"Origin": "https://gfycat.com"} + data = { + "access_key": "Anr96uuqt9EdamSCwK4txKPjMsf2" + "M95Rfa5FLLhPFucu8H5HTzeutyAa", + } + response = self.extractor.request( + url, method="POST", headers=headers, json=data).json() + + url = "https://weblogin.gfycat.com/oauth/weblogin" + headers["authorization"] = "Bearer " + response["access_token"] + data = { + "grant_type": "password", + "username" : username, + "password" : password, + } + response = self.extractor.request( + url, method="POST", headers=headers, json=data, fatal=None).json() + + if "errorMessage" in response: + raise exception.AuthenticationError( + response["errorMessage"]["description"]) + return "Bearer " + response["access_token"] + def _call(self, endpoint, params=None): + if self.username: + self.authenticate() + url = self.API_ROOT + endpoint - return self.extractor.request(url, params=params).json() + return self.extractor.request( + url, params=params, headers=self.headers).json() def _pagination(self, endpoint, params, key="gfycats"): while True: diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 78e6843eaa..fb6ffa7ba7 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ 
-312,6 +312,7 @@ "fanbox" : _COOKIES, "fantia" : _COOKIES, "flickr" : _OAUTH, + "gfycat" : "Supported", "furaffinity" : _COOKIES, "horne" : "Required", "idolcomplex" : "Supported", diff --git a/test/test_results.py b/test/test_results.py index 03a17c40ec..3c7d2844f7 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -325,7 +325,7 @@ def setup_test_config(): for category in ("danbooru", "atfbooru", "aibooru", "e621", "e926", "instagram", "twitter", "subscribestar", "deviantart", "inkbunny", "tapas", "pillowfort", "mangadex", - "vipergirls"): + "vipergirls", "gfycat"): config.set(("extractor", category), "username", None) config.set(("extractor", "mastodon.social"), "access-token", From c2ac665ff7a73c30e23f3fd8831466b12ce10888 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 6 Jul 2023 19:03:53 +0200 Subject: [PATCH 190/252] [fantia] send 'X-Requested-With' header (#4273) --- gallery_dl/extractor/fantia.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 35c4cc4517..8186c55a9b 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -23,6 +23,7 @@ def items(self): self.headers = { "Accept" : "application/json, text/plain, */*", "Referer": self.root, + "X-Requested-With": "XMLHttpRequest", } _empty_plan = { "id" : 0, From 384337d3dd817816642470eafb85138b719350e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 7 Jul 2023 15:16:18 +0200 Subject: [PATCH 191/252] [fantia] send 'X-Requested-With' header only for API requests (#4273) --- gallery_dl/extractor/fantia.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 8186c55a9b..f92b904633 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -69,7 +69,8 @@ def posts(self): def _pagination(self, url): params = {"page": 1} - headers = self.headers + headers = self.headers.copy() + del headers["X-Requested-With"] while True: page = self.request(url, params=params, headers=headers).text From b480b7076a51bafab76993282f85b4eae5621819 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 7 Jul 2023 20:00:49 +0200 Subject: [PATCH 192/252] [paheal] fix a78f8ce5 for enabled 'metadata' (#4262) --- gallery_dl/extractor/paheal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index d928ebd7de..6152a93a4b 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -127,7 +127,7 @@ def _extract_data(post): } def _extract_data_ex(self, post): - pid = post[:post.index('"')] + pid = post[:post.index("'")] return self._extract_post(pid) From f0cb9515660c2d980e97cccada94de09d814d362 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 7 Jul 2023 20:03:00 +0200 Subject: [PATCH 193/252] [paheal] unescape 'source' --- gallery_dl/extractor/paheal.py | 38 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 6152a93a4b..1fa571c43b 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -55,8 +55,8 @@ def _extract_post(self, post_id): "class='username' href='/user/", "'")), "date" : text.parse_datetime( 
extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"), - "source" : text.extract( - extr(">Source Link<", "</td>"), "href='", "'")[0], + "source" : text.unescape(text.extr( + extr(">Source Link<", "</td>"), "href='", "'")), } dimensions, size, ext = extr("Info</th><td>", ">").split(" // ") @@ -74,10 +74,34 @@ class PahealTagExtractor(PahealExtractor): directory_fmt = ("{category}", "{search_tags}") pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" r"/post/list/([^/?#]+)") - test = ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { - "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", - "count": ">= 15" - }) + test = ( + ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { + "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", + "count": ">= 15" + }), + ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { + "range": "1", + "options": (("metadata", True),), + "keyword": { + "date": "dt:2018-01-07 07:04:05", + "duration": 0.0, + "extension": "jpg", + "filename": "2446128 - Ayane_Suzuki Idolmaster " + "idolmaster_dearly_stars Zanzi", + "height": 768, + "id": 2446128, + "md5": "b0ceda9d860df1d15b60293a7eb465c1", + "search_tags": "Ayane_Suzuki", + "size": 205312, + "source": "https://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=19957280", + "tags": "Ayane_Suzuki Idolmaster " + "idolmaster_dearly_stars Zanzi", + "uploader": "XXXname", + "width": 1024, + }, + }), + ) per_page = 70 def __init__(self, match): @@ -151,7 +175,7 @@ class PahealPostExtractor(PahealExtractor): "id": 481609, "md5": "bbdc1c33410c2cdce7556c7990be26b7", "size": 157389, - "source": None, + "source": "", "tags": "Azumanga_Daioh inanimate Osaka Vuvuzela", "uploader": "CaptainButtface", "width": 614, From 88d1e2940146939b4b62dc18cdc2b603ed710d9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 7 Jul 2023 20:10:28 +0200 Subject: [PATCH 194/252] [bunkr] use '.la' TLD for 'media-files12' servers (#4147, #4276) --- gallery_dl/extractor/bunkr.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 5c8c530fc8..35b275255d 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -52,6 +52,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "num": int, }, }), + # cdn12 .ru TLD (#4147) + ("https://bunkrr.su/a/j1G29CnD", { + "pattern": r"https://(cdn12.bunkr.ru|media-files12.bunkr.la)/\w+", + "count": 8, + }), ("https://bunkrr.su/a/Lktg9Keq"), ("https://bunkr.la/a/Lktg9Keq"), ("https://bunkr.su/a/Lktg9Keq"), @@ -87,10 +92,12 @@ def fetch_album(self, album_id): url = text.unescape(url) if url.endswith((".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts", ".zip", ".rar", ".7z")): - append({"file": url.replace("://cdn", "://media-files", 1), - "_http_headers": headers}) - else: - append({"file": url}) + if url.startswith("https://cdn12."): + url = ("https://media-files12.bunkr.la" + + url[url.find("/", 14):]) + else: + url = url.replace("://cdn", "://media-files", 1) + append({"file": url, "_http_headers": headers}) return files, { "album_id" : self.album_id, From d5b6802774604a571a678c1930ce71e11e9dcb5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 7 Jul 2023 20:51:49 +0200 Subject: [PATCH 195/252] [seiga] set 'skip_fetish_warning' cookie (#4242) --- gallery_dl/extractor/seiga.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git 
a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index 7b8d2a319e..711435ef91 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -164,6 +164,10 @@ class SeigaImageExtractor(SeigaExtractor): ("https://seiga.nicovideo.jp/seiga/im123", { "exception": exception.NotFoundError, }), + ("https://seiga.nicovideo.jp/seiga/im10877923", { + "pattern": r"https://lohas\.nicoseiga\.jp/priv/5936a2a6c860a600e46" + r"5e0411c0822e0b510e286/1688757110/10877923", + }), ("https://seiga.nicovideo.jp/image/source/5977527"), ("https://sp.seiga.nicovideo.jp/seiga/#!/im5977527"), ("https://lohas.nicoseiga.jp/thumb/5977527i"), @@ -182,6 +186,9 @@ def skip(self, num): return num def get_images(self): + self.session.cookies.set( + "skip_fetish_warning", "1", domain="seiga.nicovideo.jp") + url = "{}/seiga/im{}".format(self.root, self.image_id) page = self.request(url, notfound="image").text From 6cbc434b54e285d901f6e0a9c3682175facc368b Mon Sep 17 00:00:00 2001 From: ActuallyKit <54536660+ActuallyKit@users.noreply.github.com> Date: Sun, 9 Jul 2023 02:28:35 +0700 Subject: [PATCH 196/252] Fix users pagination --- gallery_dl/extractor/twitter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index c0b4ab615f..8cf8f19454 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1762,9 +1762,8 @@ def _pagination_users(self, endpoint, variables, path=None): yield user elif entry["entryId"].startswith("cursor-bottom-"): cursor = entry["content"]["value"] - elif instr["type"] == "TimelineTerminateTimeline": - if instr["direction"] == "Bottom": - stop = True + if (cursor.startswith("-1|") or cursor.startswith("0|")): + stop = True if stop or not cursor or not entry: return From a437a34bcf925243edee0f43e5cc86b11e1610b4 Mon Sep 17 00:00:00 2001 From: ActuallyKit <54536660+ActuallyKit@users.noreply.github.com> Date: Sun, 9 Jul 2023 02:41:46 +0700 Subject: [PATCH 197/252] fix lint i guess? 
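Split the combined cursor check from the previous commit into two separate startswith() conditions so that the project's flake8 run passes.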
--- gallery_dl/extractor/twitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 8cf8f19454..420212a1b7 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1762,7 +1762,9 @@ def _pagination_users(self, endpoint, variables, path=None): yield user elif entry["entryId"].startswith("cursor-bottom-"): cursor = entry["content"]["value"] - if (cursor.startswith("-1|") or cursor.startswith("0|")): + if (cursor.startswith("-1|")): + stop = True + if (cursor.startswith("0|")): stop = True if stop or not cursor or not entry: From c321c773f2cbc8cd1cb4ced405c7abcbac705b96 Mon Sep 17 00:00:00 2001 From: ActuallyKit <54536660+ActuallyKit@users.noreply.github.com> Date: Sun, 9 Jul 2023 02:52:04 +0700 Subject: [PATCH 198/252] make the code less ugly --- gallery_dl/extractor/twitter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 420212a1b7..ff2e858c64 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1762,9 +1762,8 @@ def _pagination_users(self, endpoint, variables, path=None): yield user elif entry["entryId"].startswith("cursor-bottom-"): cursor = entry["content"]["value"] - if (cursor.startswith("-1|")): - stop = True - if (cursor.startswith("0|")): + if cursor.startswith("-1|") or \ + cursor.startswith("0|"): stop = True if stop or not cursor or not entry: From d3d639a15960380a561ff054a8363838d495bdaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 8 Jul 2023 22:49:34 +0200 Subject: [PATCH 199/252] [twitter] don't treat missing 'TimelineAddEntries' as fatal (#4278) --- gallery_dl/extractor/twitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index c0b4ab615f..8c3fe8e4f0 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1595,7 +1595,9 @@ def _pagination_tweets(self, endpoint, variables, if entry["entryId"].startswith("cursor-bottom-"): cursor = entry["content"]["value"] if entries is None: - raise KeyError() + if not cursor: + return + entries = () except LookupError: extr.log.debug(data) From a27dbe8c820d82291df76afd4fdcefb306c72513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 8 Jul 2023 22:58:33 +0200 Subject: [PATCH 200/252] [twitter] use 'TweetResultByRestId' endpoint (#4250) allows accessing single Tweets without login --- gallery_dl/extractor/twitter.py | 57 +++++++++++++++++++++++++++------ gallery_dl/version.py | 2 +- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 8c3fe8e4f0..3fe05738e3 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -929,16 +929,15 @@ def tweets(self): def _tweets_single(self, tweet_id): tweets = [] - for tweet in self.api.tweet_detail(tweet_id): - if tweet["rest_id"] == tweet_id or \ - tweet.get("_retweet_id_str") == tweet_id: - if self._user_obj is None: - self._assign_user(tweet["core"]["user_results"]["result"]) - tweets.append(tweet) + tweet = self.api.tweet_result_by_rest_id(tweet_id) + self._assign_user(tweet["core"]["user_results"]["result"]) - tweet_id = tweet["legacy"].get("quoted_status_id_str") - if not tweet_id: - break + while True: + 
tweets.append(tweet) + tweet_id = tweet["legacy"].get("quoted_status_id_str") + if not tweet_id: + break + tweet = self.api.tweet_result_by_rest_id(tweet_id) return tweets @@ -1179,6 +1178,46 @@ def __init__(self, extractor): "responsive_web_enhance_cards_enabled": False, } + def tweet_result_by_rest_id(self, tweet_id): + endpoint = "/graphql/2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId" + params = { + "variables": self._json_dumps({ + "tweetId": tweet_id, + "withCommunity": False, + "includePromotedContent": False, + "withVoice": False, + }), + "features": self._json_dumps({ + "creator_subscriptions_tweet_preview_api_enabled": True, + "tweetypie_unmention_optimization_enabled": True, + "responsive_web_edit_tweet_api_enabled": True, + "graphql_is_translatable_rweb_tweet_is_translatable_enabled": + True, + "view_counts_everywhere_api_enabled": True, + "longform_notetweets_consumption_enabled": True, + "responsive_web_twitter_article_tweet_consumption_enabled": + False, + "tweet_awards_web_tipping_enabled": False, + "freedom_of_speech_not_reach_fetch_enabled": True, + "standardized_nudges_misinfo": True, + "tweet_with_visibility_results_prefer_gql_" + "limited_actions_policy_enabled": True, + "longform_notetweets_rich_text_read_enabled": True, + "longform_notetweets_inline_media_enabled": True, + "responsive_web_graphql_exclude_directive_enabled": True, + "verified_phone_label_enabled": False, + "responsive_web_media_download_video_enabled": False, + "responsive_web_graphql_skip_user_profile_" + "image_extensions_enabled": False, + "responsive_web_graphql_timeline_navigation_enabled": True, + "responsive_web_enhance_cards_enabled": False, + }), + "fieldToggles": self._json_dumps({ + "withArticleRichContentState": False, + }), + } + return self._call(endpoint, params)["data"]["tweetResult"]["result"] + def tweet_detail(self, tweet_id): endpoint = "/graphql/JlLZj42Ltr2qwjasw-l5lQ/TweetDetail" variables = { diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 39cfbd1c5d..bb5c28b3c9 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
-__version__ = "1.26.0-dev" +__version__ = "1.25.8-dev" From a1ffa1ff09c6f2761b7a42d65e5dfec366d670d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 8 Jul 2023 23:11:27 +0200 Subject: [PATCH 201/252] [philomena] fix '--range' (#4288) --- gallery_dl/extractor/philomena.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index df85b964b8..24454ef115 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -19,6 +19,7 @@ class PhilomenaExtractor(BooruExtractor): filename_fmt = "{filename}.{extension}" archive_fmt = "{id}" request_interval = 1.0 + page_start = 1 per_page = 50 _file_url = operator.itemgetter("view_url") @@ -28,7 +29,7 @@ def _prepare(post): post["date"] = text.parse_datetime(post["created_at"]) def _pagination(self, url, params): - params["page"] = 1 + params["page"] = self.page_start params["per_page"] = self.per_page api_key = self.config("api-key") From a2111dd025902b1397349439030306eaac32b40e Mon Sep 17 00:00:00 2001 From: enduser420 <91022934+enduser420@users.noreply.github.com> Date: Sun, 9 Jul 2023 12:48:47 +0530 Subject: [PATCH 202/252] [wikifeet] fix 'tag' extraction --- gallery_dl/extractor/wikifeet.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py index 662e08b9e1..5f02e94ad8 100644 --- a/gallery_dl/extractor/wikifeet.py +++ b/gallery_dl/extractor/wikifeet.py @@ -32,7 +32,7 @@ class WikifeetGalleryExtractor(GalleryExtractor): "pid" : int, "width" : int, "height" : int, - "shoesize" : "7.5 US", + "shoesize" : "9 US", "type" : "women", "tags" : list, }, @@ -50,7 +50,7 @@ class WikifeetGalleryExtractor(GalleryExtractor): "pid" : int, "width" : int, "height" : int, - "shoesize" : "[NOT SET]", + "shoesize" : "4 US", "type" : "women", "tags" : list, }, @@ -111,7 +111,10 @@ def images(self, page): "pid" : data["pid"], "width" : data["pw"], "height": data["ph"], - "tags" : [tagmap[tag] for tag in data["tags"]], + "tags" : [ + tagmap[tag] + for tag in data["tags"] if tag in tagmap + ], }) for data in util.json_loads(text.extr(page, "['gdata'] = ", ";")) ] From 1590124aae8fe13e7923b17ffc8dadbe6dd23960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 10 Jul 2023 14:12:56 +0200 Subject: [PATCH 203/252] [twibooru] fix '--range' --- gallery_dl/extractor/twibooru.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index 30bf2f15fb..a8acd319ca 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,6 +22,7 @@ class TwibooruExtractor(BooruExtractor): filename_fmt = "{id}_{filename}.{extension}" archive_fmt = "{id}" request_interval = 6.05 + page_start = 1 per_page = 50 root = "https://twibooru.org" @@ -230,7 +231,7 @@ def _pagination(self, endpoint, params): elif not api_key: params["filter_id"] = "2" - params["page"] = 1 + params["page"] = extr.page_start params["per_page"] = per_page = extr.per_page while True: From f6553ffd2f2fb515be35c50dae4e303fa252a954 Mon Sep 17 00:00:00 2001 From: 

From f6553ffd2f2fb515be35c50dae4e303fa252a954 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Mon, 10 Jul 2023 14:39:09 +0200
Subject: [PATCH 204/252] [twitter] simplify '_pagination_users'

- remove 'stop' variable
- call 'cursor.startswith()' only once
---
 gallery_dl/extractor/twitter.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ff2e858c64..2183fdc5bb 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1733,7 +1733,7 @@ def _pagination_users(self, endpoint, variables, path=None):
                   "features" : self._json_dumps(self.features_pagination)}

         while True:
-            cursor = entry = stop = None
+            cursor = entry = None
            params["variables"] = self._json_dumps(variables)
             data = self._call(endpoint, params)["data"]

@@ -1762,11 +1762,8 @@ def _pagination_users(self, endpoint, variables, path=None):
                                 yield user
                         elif entry["entryId"].startswith("cursor-bottom-"):
                             cursor = entry["content"]["value"]
-                            if cursor.startswith("-1|") or \
-                               cursor.startswith("0|"):
-                                stop = True

-            if stop or not cursor or not entry:
+            if not cursor or cursor.startswith(("-1|", "0|")) or not entry:
                 return
             variables["cursor"] = cursor
"https://github.com/mikf/gallery-dl/blob/master" + "/docs/configuration.rst" + "#extractorredditclient-id--user-agent") + self.extractor.wait( + seconds=response.headers["x-ratelimit-reset"]) + continue - try: - data = response.json() - except ValueError: - raise exception.StopExtraction(text.remove_html(response.text)) - - if "error" in data: - if data["error"] == 403: - raise exception.AuthorizationError() - if data["error"] == 404: - raise exception.NotFoundError() - self.log.debug(data) - raise exception.StopExtraction(data.get("message")) - return data + try: + data = response.json() + except ValueError: + raise exception.StopExtraction(text.remove_html(response.text)) + + if "error" in data: + if data["error"] == 403: + raise exception.AuthorizationError() + if data["error"] == 404: + raise exception.NotFoundError() + self.log.debug(data) + raise exception.StopExtraction(data.get("message")) + return data def _pagination(self, endpoint, params): id_min = self._parse_id("id-min", 0) From fc43c7469488e9400101c02a859867ec17bafd35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 13 Jul 2023 15:32:21 +0200 Subject: [PATCH 206/252] [docs] update reddit client-id instructions (#4292) add clearing cached access tokens and getting a new refresh token --- docs/configuration.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 75033307c4..1868fa788c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -5114,6 +5114,10 @@ How To ``user-agent`` and replace ``<application name>`` and ``<username>`` accordingly (see Reddit's `API access rules <https://github.com/reddit/reddit/wiki/API>`__) + * clear your `cache <cache.file_>`__ to delete any remaining + ``access-token`` entries. 

From fceabee43338eafc446444e574924d36f3e386e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Thu, 13 Jul 2023 18:08:40 +0200
Subject: [PATCH 207/252] [philomena] use API interface class

handle 429 errors and retry after 10min (#4288)
---
 gallery_dl/extractor/philomena.py | 115 ++++++++++++++++++++----------
 1 file changed, 79 insertions(+), 36 deletions(-)

diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 24454ef115..e7188285cb 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -22,37 +22,16 @@ class PhilomenaExtractor(BooruExtractor):
     page_start = 1
     per_page = 50

+    def __init__(self, match):
+        BooruExtractor.__init__(self, match)
+        self.api = PhilomenaAPI(self)
+
     _file_url = operator.itemgetter("view_url")

     @staticmethod
     def _prepare(post):
         post["date"] = text.parse_datetime(post["created_at"])

-    def _pagination(self, url, params):
-        params["page"] = self.page_start
-        params["per_page"] = self.per_page
-
-        api_key = self.config("api-key")
-        if api_key:
-            params["key"] = api_key
-
-        filter_id = self.config("filter")
-        if filter_id:
-            params["filter_id"] = filter_id
-        elif not api_key:
-            try:
-                params["filter_id"] = INSTANCES[self.category]["filter_id"]
-            except (KeyError, TypeError):
-                params["filter_id"] = "2"
-
-        while True:
-            data = self.request(url, params=params).json()
-            yield from data["images"]
-
-            if len(data["images"]) < self.per_page:
-                return
-            params["page"] += 1

 INSTANCES = {
     "derpibooru": {
@@ -147,8 +126,7 @@ def __init__(self, match):
         self.image_id = match.group(match.lastindex)

     def posts(self):
-        url = self.root + "/api/v1/json/images/" + self.image_id
-        return (self.request(url).json()["image"],)
+        return (self.api.image(self.image_id),)

 class PhilomenaSearchExtractor(PhilomenaExtractor):
@@ -202,8 +180,7 @@ def metadata(self):
         return {"search_tags": self.params.get("q", "")}

     def posts(self):
-        url = self.root + "/api/v1/json/search/images"
-        return self._pagination(url, self.params)
+        return self.api.search(self.params)

 class PhilomenaGalleryExtractor(PhilomenaExtractor):
@@ -240,15 +217,81 @@ def __init__(self, match):
         self.gallery_id = match.group(match.lastindex)

     def metadata(self):
-        url = self.root + "/api/v1/json/search/galleries"
-        params = {"q": "id:" + self.gallery_id}
-        galleries = self.request(url, params=params).json()["galleries"]
-        if not galleries:
+        try:
+            return {"gallery": self.api.gallery(self.gallery_id)}
+        except IndexError:
             raise exception.NotFoundError("gallery")
-        return {"gallery": galleries[0]}

     def posts(self):
         gallery_id = "gallery_id:" + self.gallery_id
-        url = self.root + "/api/v1/json/search/images"
         params = {"sd": "desc", "sf": gallery_id, "q": gallery_id}
-        return self._pagination(url, params)
+        return self.api.search(params)
+
+
+class PhilomenaAPI():
+    """Interface for the Philomena API
+
+    https://www.derpibooru.org/pages/api
+    """
+
+    def __init__(self, extractor):
+        self.extractor = extractor
+        self.root = extractor.root + "/api"
+
+    def gallery(self, gallery_id):
+        endpoint = "/v1/json/search/galleries"
+        params = {"q": "id:" + gallery_id}
+        return self._call(endpoint, params)["galleries"][0]
+
+    def image(self, image_id):
+        endpoint = "/v1/json/images/" + image_id
+        return self._call(endpoint)["image"]
+
+    def search(self, params):
+        endpoint = "/v1/json/search/images"
+        return self._pagination(endpoint, params)
+
+    def _call(self, endpoint, params=None):
+        url = self.root + endpoint
+
+        while True:
+            response = self.extractor.request(url, params=params, fatal=None)
+
+            if response.status_code < 400:
+                return response.json()
+
+            if response.status_code == 429:
+                self.extractor.wait(seconds=600)
+                continue
+
+            # error
+            self.extractor.log.debug(response.content)
+            raise exception.StopExtraction(
+                "%s %s", response.status_code, response.reason)
+
+    def _pagination(self, endpoint, params):
+        extr = self.extractor
+
+        api_key = extr.config("api-key")
+        if api_key:
+            params["key"] = api_key
+
+        filter_id = extr.config("filter")
+        if filter_id:
+            params["filter_id"] = filter_id
+        elif not api_key:
+            try:
+                params["filter_id"] = INSTANCES[extr.category]["filter_id"]
+            except (KeyError, TypeError):
+                params["filter_id"] = "2"
+
+        params["page"] = extr.page_start
+        params["per_page"] = extr.per_page
+
+        while True:
+            data = self._call(endpoint, params)
+            yield from data["images"]
+
+            if len(data["images"]) < extr.per_page:
+                return
+            params["page"] += 1
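
PATCH 207 follows a structure used throughout gallery-dl: HTTP and endpoint
details move out of the extractor into a dedicated API class that keeps a
back-reference for requests, waiting, and logging. The skeleton of that
arrangement, reduced to its essentials (names and the root URL here are
placeholders, not the Philomena code):

    class Extractor:
        def __init__(self):
            self.api = API(self)   # extractor owns the API client

        def posts(self):
            return self.api.search({"q": "id:123"})

    class API:
        def __init__(self, extractor):
            self.extractor = extractor             # for request()/wait()/log
            self.root = "https://example.org/api"  # placeholder root

        def search(self, params):
            return self._call("/v1/json/search/images", params)

        def _call(self, endpoint, params=None):
            ...  # one place for request dispatch and 429 retries

This keeps pagination, filtering, and error handling in one class instead of
being repeated across every extractor subclass.
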

From 86560fe0cd7518171b1802df5421e510c4fdfd01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Thu, 13 Jul 2023 18:32:55 +0200
Subject: [PATCH 208/252] [bcy] remove module

"The website was shut down on July 12, 2023"
https://danbooru.donmai.us/wiki_pages/bcy
---
 docs/supportedsites.md           |   6 --
 gallery_dl/extractor/__init__.py |   1 -
 gallery_dl/extractor/bcy.py      | 206 -------------------------------
 scripts/supportedsites.py        |   1 -
 4 files changed, 214 deletions(-)
 delete mode 100644 gallery_dl/extractor/bcy.py

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index a3c0ee10bd..0ffb951109 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -1009,12 +1009,6 @@ Consider all sites to be NSFW unless otherwise known.
    <td>Posts, Tag Searches</td>
    <td></td>
 </tr>
-<tr>
-    <td>半次元</td>
-    <td>https://bcy.net/</td>
-    <td>Posts, User Profiles</td>
-    <td></td>
-</tr>

 <tr>
     <td colspan="4"><strong>Danbooru Instances</strong></td>
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index a344fe42ff..062305aa1d 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -24,7 +24,6 @@
     "artstation",
     "aryion",
     "bbc",
-    "bcy",
     "behance",
     "blogger",
     "bunkr",
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
deleted file mode 100644
index d6adb4eb17..0000000000
--- a/gallery_dl/extractor/bcy.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2020-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://bcy.net/"""
-
-from .common import Extractor, Message
-from .. import text, util, exception
-import re
-
-
-class BcyExtractor(Extractor):
-    """Base class for bcy extractors"""
-    category = "bcy"
-    directory_fmt = ("{category}", "{user[id]} {user[name]}")
-    filename_fmt = "{post[id]} {id}.{extension}"
-    archive_fmt = "{post[id]}_{id}"
-    root = "https://bcy.net"
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.item_id = match.group(1)
-        self.session.headers["Referer"] = self.root + "/"
-
-    def items(self):
-        sub = re.compile(r"^https?://p\d+-bcy"
-                         r"(?:-sign\.bcyimg\.com|\.byteimg\.com/img)"
-                         r"/banciyuan").sub
-        iroot = "https://img-bcy-qn.pstatp.com"
-        noop = self.config("noop")
-
-        for post in self.posts():
-            if not post["image_list"]:
-                continue
-
-            multi = None
-            tags = post.get("post_tags") or ()
-            data = {
-                "user": {
-                    "id"     : post["uid"],
-                    "name"   : post["uname"],
-                    "avatar" : sub(iroot, post["avatar"].partition("~")[0]),
-                },
-                "post": {
-                    "id"     : text.parse_int(post["item_id"]),
-                    "tags"   : [t["tag_name"] for t in tags],
-                    "date"   : text.parse_timestamp(post["ctime"]),
-                    "parody" : post["work"],
-                    "content": post["plain"],
-                    "likes"  : post["like_count"],
-                    "shares" : post["share_count"],
-                    "replies": post["reply_count"],
-                },
-            }
-
-            yield Message.Directory, data
-            for data["num"], image in enumerate(post["image_list"], 1):
-                data["id"] = image["mid"]
-                data["width"] = image["w"]
-                data["height"] = image["h"]
-
-                url = image["path"].partition("~")[0]
-                text.nameext_from_url(url, data)
-
-                # full-resolution image without watermark
-                if data["extension"]:
-                    if not url.startswith(iroot):
-                        url = sub(iroot, url)
-                    data["filter"] = ""
-                    yield Message.Url, url, data
-
-                # watermarked image & low quality noop filter
-                else:
-                    if multi is None:
-                        multi = self._data_from_post(
-                            post["item_id"])["post_data"]["multi"]
-                    image = multi[data["num"] - 1]
-
-                    if image["origin"]:
-                        data["filter"] = "watermark"
-                        yield Message.Url, image["origin"], data
-
-                    if noop:
-                        data["extension"] = ""
-                        data["filter"] = "noop"
-                        yield Message.Url, image["original_path"], data
-
-    def posts(self):
-        """Returns an iterable with all relevant 'post' objects"""
-
-    def _data_from_post(self, post_id):
-        url = "{}/item/detail/{}".format(self.root, post_id)
-        page = self.request(url, notfound="post").text
-        data = (text.extr(page, 'JSON.parse("', '");')
-                .replace('\\\\u002F', '/')
-                .replace('\\"', '"'))
-        try:
-            return util.json_loads(data)["detail"]
-        except ValueError:
-            return util.json_loads(data.replace('\\"', '"'))["detail"]
-
-
-class BcyUserExtractor(BcyExtractor):
-    """Extractor for user timelines"""
-    subcategory = "user"
-    pattern = r"(?:https?://)?bcy\.net/u/(\d+)"
-    test = (
-        ("https://bcy.net/u/1933712", {
-            "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg",
-            "count": ">= 20",
-        }),
-        ("https://bcy.net/u/109282764041", {
-            "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+"
-                       r"~tplv-bcyx-yuan-logo-v1:.+\.image",
-            "range": "1-25",
-            "count": 25,
-        }),
-    )
-
-    def posts(self):
-        url = self.root + "/apiv3/user/selfPosts"
-        params = {"uid": self.item_id, "since": None}
-
-        while True:
-            data = self.request(url, params=params).json()
-
-            try:
-                items = data["data"]["items"]
-            except KeyError:
-                return
-            if not items:
-                return
-
-            for item in items:
-                yield item["item_detail"]
-            params["since"] = item["since"]
-
-
-class BcyPostExtractor(BcyExtractor):
-    """Extractor for individual posts"""
-    subcategory = "post"
-    pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)"
-    test = (
-        ("https://bcy.net/item/detail/6355835481002893070", {
-            "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3",
-            "count": 1,
-            "keyword": {
-                "user": {
-                    "id"     : 1933712,
-                    "name"   : "wukloo",
-                    "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/",
-                },
-                "post": {
-                    "id"     : 6355835481002893070,
-                    "tags"   : list,
-                    "date"   : "dt:2016-11-22 08:47:46",
-                    "parody" : "东方PROJECT",
-                    "content": "re:根据微博的建议稍微做了点修改",
-                    "likes"  : int,
-                    "shares" : int,
-                    "replies": int,
-                },
-                "id": 8330182,
-                "num": 1,
-                "width" : 3000,
-                "height": 1687,
-                "filename": "712e0780b09011e696f973c3d1568337",
-                "extension": "jpg",
-            },
-        }),
-        # only watermarked images available
-        ("https://bcy.net/item/detail/6950136331708144648", {
-            "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+"
-                       r"~tplv-bcyx-yuan-logo-v1:.+\.image",
-            "count": 10,
-            "keyword": {"filter": "watermark"},
-        }),
-        # deleted
-        ("https://bcy.net/item/detail/6780546160802143237", {
-            "exception": exception.NotFoundError,
-            "count": 0,
-        }),
-        # only visible to logged in users
-        ("https://bcy.net/item/detail/6747523535150783495", {
-            "count": 0,
-        }),
-        # JSON decode error (#3321)
-        ("https://bcy.net/item/detail/7166939271872388110", {
-            "count": 0,
-        }),
-    )
-
-    def posts(self):
-        try:
-            data = self._data_from_post(self.item_id)
-        except KeyError:
-            return ()
-        post = data["post_data"]
-        post["image_list"] = post["multi"]
-        post["plain"] = text.parse_unicode_escapes(post["plain"])
-        post.update(data["detail_user"])
-        return (post,)
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index fb6ffa7ba7..f90af3e3a9 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -28,7 +28,6 @@
     "b4k"           : "arch.b4k.co",
     "baraag"        : "baraag",
    "bbc"           : "BBC",
-    "bcy"           : "半次元",
     "comicvine"     : "Comic Vine",
     "coomerparty"   : "Coomer",
     "deviantart"    : "DeviantArt",

From 1137b89ed4417d376f4b2513ae94ef002a12c13a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Thu, 13 Jul 2023 20:28:05 +0200
Subject: [PATCH 209/252] [lineblog] remove module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"LINE BLOGは2023年6月29日をもちましてサービスを終了いたしました"
("LINE BLOG ended its service on June 29, 2023")
---
 docs/supportedsites.md           |  6 ---
 gallery_dl/extractor/__init__.py |  1 -
 gallery_dl/extractor/lineblog.py | 73 --------------------------------
 scripts/supportedsites.py        |  1 -
 4 files changed, 81 deletions(-)
 delete mode 100644 gallery_dl/extractor/lineblog.py

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 0ffb951109..3183ab5162 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -481,12 +481,6 @@ Consider all sites to be NSFW unless otherwise known.
    <td>Galleries</td>
    <td></td>
 </tr>
-<tr>
-    <td>LINE BLOG</td>
-    <td>https://www.lineblog.me/</td>
-    <td>Blogs, Posts</td>
-    <td></td>
-</tr>
 <tr>
     <td>livedoor Blog</td>
     <td>http://blog.livedoor.jp/</td>
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 062305aa1d..fa56bfb45d 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -84,7 +84,6 @@
     "lensdump",
     "lexica",
     "lightroom",
-    "lineblog",
     "livedoor",
     "luscious",
     "lynxchan",
diff --git a/gallery_dl/extractor/lineblog.py b/gallery_dl/extractor/lineblog.py
deleted file mode 100644
index adb27a805c..0000000000
--- a/gallery_dl/extractor/lineblog.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019-2020 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://www.lineblog.me/"""
-
-from .livedoor import LivedoorBlogExtractor, LivedoorPostExtractor
-from .. import text
-
-
-class LineblogBase():
-    """Base class for lineblog extractors"""
-    category = "lineblog"
-    root = "https://lineblog.me"
-
-    def _images(self, post):
-        imgs = []
-        body = post.pop("body")
-
-        for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
-            src = text.extr(img, 'src="', '"')
-            alt = text.extr(img, 'alt="', '"')
-
-            if not src:
-                continue
-            if src.startswith("https://obs.line-scdn.") and src.count("/") > 3:
-                src = src.rpartition("/")[0]
-
-            imgs.append(text.nameext_from_url(alt or src, {
-                "url" : src,
-                "num" : num,
-                "hash": src.rpartition("/")[2],
-                "post": post,
-            }))
-
-        return imgs
-
-
-class LineblogBlogExtractor(LineblogBase, LivedoorBlogExtractor):
-    """Extractor for a user's blog on lineblog.me"""
-    pattern = r"(?:https?://)?lineblog\.me/(\w+)/?(?:$|[?#])"
-    test = ("https://lineblog.me/mamoru_miyano/", {
-        "range": "1-20",
-        "count": 20,
-        "pattern": r"https://obs.line-scdn.net/[\w-]+$",
-        "keyword": {
-            "post": {
-                "categories" : tuple,
-                "date"       : "type:datetime",
-                "description": str,
-                "id"         : int,
-                "tags"       : list,
-                "title"      : str,
-                "user"       : "mamoru_miyano"
-            },
-            "filename": str,
-            "hash"    : r"re:\w{32,}",
-            "num"     : int,
-        },
-    })
-
-
-class LineblogPostExtractor(LineblogBase, LivedoorPostExtractor):
-    """Extractor for blog posts on lineblog.me"""
-    pattern = r"(?:https?://)?lineblog\.me/(\w+)/archives/(\d+)"
-    test = ("https://lineblog.me/mamoru_miyano/archives/1919150.html", {
-        "url": "24afeb4044c554f80c374b52bf8109c6f1c0c757",
-        "keyword": "76a38e2c0074926bd3362f66f9fc0e6c41591dcb",
-    })
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index f90af3e3a9..c25a21bdae 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -66,7 +66,6 @@
     "jpgfish"       : "JPG Fish",
     "kabeuchi"      : "かべうち",
     "kemonoparty"   : "Kemono",
-    "lineblog"      : "LINE BLOG",
     "livedoor"      : "livedoor Blog",
     "ohpolly"       : "Oh Polly",
     "omgmiamiswimwear": "Omg Miami Swimwear",

From 2dd6942d1cacef68a9a95c9bf27b42c6f6034079 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Thu, 13 Jul 2023 23:21:01 +0200
Subject: [PATCH 210/252] [jpgfish] update domain to 'jpeg.pet'

---
 docs/supportedsites.md          |  2 +-
 gallery_dl/extractor/jpgfish.py | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 3183ab5162..570f038b65 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -435,7 +435,7 @@ Consider all sites to be NSFW unless otherwise known.
 </tr>
 <tr>
     <td>JPG Fish</td>
-    <td>https://jpg.pet/</td>
+    <td>https://jpeg.pet/</td>
     <td>Albums, individual Images, User Profiles</td>
     <td></td>
 </tr>
diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py
index b8d425a865..39208e5a84 100644
--- a/gallery_dl/extractor/jpgfish.py
+++ b/gallery_dl/extractor/jpgfish.py
@@ -4,18 +4,18 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extractors for https://jpg.pet/"""
+"""Extractors for https://jpeg.pet/"""

 from .common import Extractor, Message
 from .. import text

-BASE_PATTERN = r"(?:https?://)?jpg\.(?:pet|fish(?:ing)?|church)"
+BASE_PATTERN = r"(?:https?://)?jpe?g\.(?:pet|fish(?:ing)?|church)"

 class JpgfishExtractor(Extractor):
     """Base class for jpgfish extractors"""
     category = "jpgfish"
-    root = "https://jpg.pet"
+    root = "https://jpeg.pet"
     directory_fmt = ("{category}", "{user}", "{album}",)
     archive_fmt = "{id}"

@@ -36,7 +36,7 @@ class JpgfishImageExtractor(JpgfishExtractor):
     subcategory = "image"
     pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))"
     test = (
-        ("https://jpg.pet/img/funnymeme.LecXGS", {
+        ("https://jpeg.pet/img/funnymeme.LecXGS", {
             "pattern": r"https://simp3\.jpg\.church/images/funnymeme\.jpg",
             "content": "098e5e9b17ad634358426e0ffd1c93871474d13c",
             "keyword": {
@@ -52,6 +52,7 @@ class JpgfishImageExtractor(JpgfishExtractor):
             "pattern": r"https://simp2\.jpg\.church/hannahowo_00457\.jpg",
             "keyword": {"album": "401-500"},
         }),
+        ("https://jpg.pet/img/funnymeme.LecXGS"),
         ("https://jpg.fishing/img/funnymeme.LecXGS"),
         ("https://jpg.fish/img/funnymeme.LecXGS"),
         ("https://jpg.church/img/funnymeme.LecXGS"),
@@ -83,7 +84,7 @@ class JpgfishAlbumExtractor(JpgfishExtractor):
     subcategory = "album"
     pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?"
     test = (
-        ("https://jpg.pet/album/CDilP/?sort=date_desc&page=1", {
+        ("https://jpeg.pet/album/CDilP/?sort=date_desc&page=1", {
             "count": 2,
         }),
         ("https://jpg.fishing/a/gunggingnsk.N9OOI", {
@@ -95,6 +96,7 @@ class JpgfishAlbumExtractor(JpgfishExtractor):
         ("https://jpg.church/a/hannahowo.aNTdH/sub", {
             "count": 606,
         }),
+        ("https://jpg.pet/album/CDilP/?sort=date_desc&page=1"),
     )

     def __init__(self, match):
@@ -120,12 +122,13 @@ class JpgfishUserExtractor(JpgfishExtractor):
     subcategory = "user"
     pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?"
     test = (
-        ("https://jpg.pet/exearco", {
+        ("https://jpeg.pet/exearco", {
             "count": 3,
         }),
         ("https://jpg.church/exearco/albums", {
             "count": 1,
         }),
+        ("https://jpg.pet/exearco"),
         ("https://jpg.fishing/exearco"),
         ("https://jpg.fish/exearco"),
         ("https://jpg.church/exearco"),
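
The widened `BASE_PATTERN` above is the whole migration story: `jpe?g` makes
the `e` optional, so every old `jpg.*` link keeps matching alongside the new
`jpeg.pet` canonical domain. A quick standalone check of which URLs it
accepts (the sample URLs come from the patch's test cases):

    import re

    BASE_PATTERN = r"(?:https?://)?jpe?g\.(?:pet|fish(?:ing)?|church)"
    for url in ("https://jpeg.pet/img/funnymeme.LecXGS",
                "https://jpg.pet/img/funnymeme.LecXGS",
                "https://jpg.fishing/img/funnymeme.LecXGS",
                "https://jpg.church/img/funnymeme.LecXGS"):
        assert re.match(BASE_PATTERN, url)
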

From 75757c4ace8a0a01ca9b66b0e31db0145073e3e7 Mon Sep 17 00:00:00 2001
From: ncaat <81056024+ncaat@users.noreply.github.com>
Date: Fri, 14 Jul 2023 12:53:24 +0200
Subject: [PATCH 211/252] [gelbooru_v01] fix 'source' (#4302)

---
 gallery_dl/extractor/gelbooru_v01.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
index c4f32a4b0f..1ea2078262 100644
--- a/gallery_dl/extractor/gelbooru_v01.py
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -27,7 +27,7 @@ def _parse_post(self, post_id):
             "uploader"  : extr('By: ', ' <'),
             "width"     : extr('Size: ', 'x'),
             "height"    : extr('', ' <'),
-            "source"    : extr('Source: <a href="', '"'),
+            "source"    : extr('Source: ', ' <'),
             "rating"    : (extr('Rating: ', '<') or "?")[0].lower(),
             "score"     : extr('Score: ', ' <'),
             "file_url"  : extr('<img alt="img" src="', '"'),

From 20da41018d269817d12722ed6c2d73eb0565eefb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Fri, 14 Jul 2023 14:30:27 +0200
Subject: [PATCH 212/252] [pornhub] set 'accessAgeDisclaimerPH' cookie (#4301)

---
 gallery_dl/extractor/pornhub.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index f19e33c37f..fa4efa027c 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -58,6 +58,9 @@ def __init__(self, match):
         self._first = None

     def items(self):
+        self.session.cookies.set(
+            "accessAgeDisclaimerPH", "1", domain=".pornhub.com")
+
         data = self.metadata()
         yield Message.Directory, data
         for num, image in enumerate(self.images(), 1):

From c6b31a21696855b5ea9f0df339a51e1e14d87cd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Fri, 14 Jul 2023 14:41:16 +0200
Subject: [PATCH 213/252] [reddit] set default 0.6s delay between requests
 (#4292)

to limit API requests to 100 per minute
https://www.reddit.com/r/redditdev/comments/14nbw6g/
---
 gallery_dl/extractor/reddit.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index af79a7bd0e..54b162b37b 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -20,6 +20,7 @@ class RedditExtractor(Extractor):
     filename_fmt = "{id}{num:? //>02} {title[:220]}.{extension}"
     archive_fmt = "{filename}"
     cookiedomain = ".reddit.com"
+    request_interval = 0.6

     def items(self):
         self.api = RedditAPI(self)
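
The one-line reddit change in PATCH 213 is simple rate arithmetic: at one
request every 0.6 seconds, a client issues at most 60 / 0.6 = 100 requests
per minute, the OAuth quota described in the linked thread. gallery-dl
applies `request_interval` for you; as a sketch, this is what the setting
amounts to:

    import time

    REQUEST_INTERVAL = 0.6            # seconds between requests

    def throttled(func, *args):
        result = func(*args)
        time.sleep(REQUEST_INTERVAL)  # caps the rate at 100 calls/minute
        return result
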

From ffbbbd3baf5ded9a69c63d5222d9837e15a33634 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Fri, 14 Jul 2023 15:09:39 +0200
Subject: [PATCH 214/252] [gelbooru_v01] 'vidyart' -> 'vidyart2'

---
 docs/supportedsites.md               |  4 ++--
 gallery_dl/extractor/gelbooru_v01.py | 12 ++++++------
 scripts/supportedsites.py            |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 570f038b65..c1f25d0681 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -1070,8 +1070,8 @@ Consider all sites to be NSFW unless otherwise known.
    <td></td>
 </tr>
 <tr>
-    <td>/v/idyart</td>
-    <td>https://vidyart.booru.org/</td>
+    <td>/v/idyart2</td>
+    <td>https://vidyart2.booru.org/</td>
     <td>Favorites, Posts, Tag Searches</td>
     <td></td>
 </tr>
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
index 1ea2078262..b6fbcb6d17 100644
--- a/gallery_dl/extractor/gelbooru_v01.py
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -78,9 +78,9 @@ def _pagination(self, url, begin, end):
         "root": "https://drawfriends.booru.org",
         "pattern": r"drawfriends\.booru\.org",
     },
-    "vidyart": {
-        "root": "https://vidyart.booru.org",
-        "pattern": r"vidyart\.booru\.org",
+    "vidyart2": {
+        "root": "https://vidyart2.booru.org",
+        "pattern": r"vidyart2\.booru\.org",
     },
 })

@@ -106,7 +106,7 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
             "count": 25,
         }),
         ("https://drawfriends.booru.org/index.php?page=post&s=list&tags=all"),
-        ("https://vidyart.booru.org/index.php?page=post&s=list&tags=all"),
+        ("https://vidyart2.booru.org/index.php?page=post&s=list&tags=all"),
     )

     def __init__(self, match):
@@ -141,7 +141,7 @@ class GelbooruV01FavoriteExtractor(GelbooruV01Extractor):
             "count": 4,
         }),
         ("https://drawfriends.booru.org/index.php?page=favorites&s=view&id=1"),
-        ("https://vidyart.booru.org/index.php?page=favorites&s=view&id=1"),
+        ("https://vidyart2.booru.org/index.php?page=favorites&s=view&id=1"),
     )

     def __init__(self, match):
@@ -193,7 +193,7 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor):
             },
         }),
         ("https://drawfriends.booru.org/index.php?page=post&s=view&id=107474"),
-        ("https://vidyart.booru.org/index.php?page=post&s=view&id=383111"),
+        ("https://vidyart2.booru.org/index.php?page=post&s=view&id=39168"),
     )

     def __init__(self, match):
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index c25a21bdae..c9fc0ab5cc 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -119,7 +119,7 @@
     "thecollection" : "The /co/llection",
     "tumblrgallery" : "TumblrGallery",
     "vanillarock"   : "もえぴりあ",
-    "vidyart"       : "/v/idyart",
+    "vidyart2"      : "/v/idyart2",
     "vk"            : "VK",
     "vsco"          : "VSCO",
     "wallpapercave" : "Wallpaper Cave",

From c84397023aee9fdaae95624caadeab2ae924fd64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Fri, 14 Jul 2023 21:52:53 +0200
Subject: [PATCH 215/252] [slideshare] fix extraction

---
 gallery_dl/extractor/slideshare.py | 59 ++++++++++++------------------
 1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index bea457f4b5..3521298f6d 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -30,21 +30,20 @@ class SlidesharePresentationExtractor(GalleryExtractor):
             "count": 19,
             "content": "2b6a191eab60b3978fdacfecf2da302dd45bc108",
             "keyword": {
-                "comments": "0",
                 "description": "Get Started with SlideShare - "
                                "A Beginngers Guide for Creators",
-                "likes": r"re:\d{3,}",
+                "likes": int,
                 "presentation": "get-started-with-slide-share",
-                "published": "dt:2015-05-20 00:00:00",
+                "date": "dt:2015-05-20 17:38:21",
                 "title": "Getting Started With SlideShare",
                 "user": "Slideshare",
-                "views": r"re:\d{7,}",
+                "views": int,
             },
         }),
         # long title and description
         (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
           "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
-            "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
+            "url": "d8952260f8bec337dd809a958ec8091350393f6b",
             "keyword": {
                 "title": "Warum Sie nicht Ihren Mitarbeitenden ändern "
                          "sollten, sondern Ihr Managementsystem",
@@ -58,7 +57,7 @@ class SlidesharePresentationExtractor(GalleryExtractor):
         # mobile URL
         (("https://www.slideshare.net"
           "/mobile/uqudent/introduction-to-fixed-prosthodontics"), {
-            "url": "43eda2adf4dd221a251c8df794dfb82649e94647",
+            "url": "72c431cb1eccbb6794f608ecbbc01d52e8768159",
         }),
     )

@@ -69,43 +68,31 @@ def __init__(self, match):
         GalleryExtractor.__init__(self, match, url)

     def metadata(self, page):
-        extr = text.extract_from(page)
-        descr = extr('<meta name="description" content="', '"')
-        comments = extr('content="UserComments:', '"')
-        likes = extr('content="UserLikes:', '"')
-        views = extr('content="UserPageVisits:', '"')
-        title = extr('<span class="j-title-breadcrumb">', '</span>')
-        published = extr('<div class="metadata-item">', '</div>')
-
-        if descr.endswith("…"):
-            alt_descr = extr('slideshow-description-text"', '</p>')
-            if alt_descr:
-                descr = text.remove_html(alt_descr.partition(">")[2]).strip()
+        data = util.json_loads(text.extr(
+            page, 'id="__NEXT_DATA__" type="application/json">', '</script>'))
+        self.slideshow = slideshow = data["props"]["pageProps"]["slideshow"]

         return {
-            "user": self.user,
+            "user"        : slideshow["username"],
             "presentation": self.presentation,
-            "title": text.unescape(title.strip()),
-            "description": text.unescape(descr),
-            "views": views,
-            "likes": likes,
-            "comments": comments,
-            "published": text.parse_datetime(
-                published.strip(), "%b. %d, %Y"),
+            "title"       : slideshow["title"].strip(),
+            "description" : slideshow["description"].strip(),
+            "views"       : slideshow["views"],
+            "likes"       : slideshow["likes"],
+            "date"        : text.parse_datetime(
+                slideshow["createdAt"], "%Y-%m-%d %H:%M:%S %Z"),
         }

-    @staticmethod
-    def images(page):
-        data = util.json_loads(text.extract(
-            page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0])
+    def images(self, page):
+        parts = self.slideshow["slideImages"][0]["baseUrl"].split("/")

-        # useing 'stripped_title' here is technically wrong, but it works all
-        # the same, slideshare doesn't seem to care what characters go there
-        begin = "https://image.slidesharecdn.com/{}/95/{}-".format(
-            data["ppt_location"], data["stripped_title"])
-        end = "-1024.jpg?cb=" + str(data["timestamp"])
+        begin = "{}/95/{}-".format(
+            "/".join(parts[:4]),
+            self.slideshow["strippedTitle"],
+        )
+        end = "-1024.jpg?" + parts[-1].rpartition("?")[2]

         return [
             (begin + str(n) + end, None)
-            for n in range(1, data["slide_count"]+1)
+            for n in range(1, self.slideshow["totalSlides"]+1)
         ]
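
The slideshare rewrite above swaps fragile HTML scraping for the
`__NEXT_DATA__` JSON blob that Next.js sites embed in every page. The general
recipe, as a hedged standalone sketch — the script-tag marker follows the
patch, while the nested key path is specific to each site:

    import json

    def next_data(page_html):
        # everything between the script tag and its closing tag is JSON
        marker = 'id="__NEXT_DATA__" type="application/json">'
        start = page_html.index(marker) + len(marker)
        end = page_html.index("</script>", start)
        return json.loads(page_html[start:end])

    # e.g. next_data(html)["props"]["pageProps"]["slideshow"]["title"]
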

From c9a82c9313bc7f611e8e460cff2f9e84dc0399f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Fri, 14 Jul 2023 22:21:02 +0200
Subject: [PATCH 216/252] [erome] ignore duplicate album IDs

---
 gallery_dl/extractor/erome.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 03307f892e..709bc5769b 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2021-2022 Mike Fährmann
+# Copyright 2021-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -80,7 +80,7 @@ def _pagination(self, url, params):
         for params["page"] in itertools.count(1):
             page = self.request(url, params=params).text

-            album_ids = EromeAlbumExtractor.pattern.findall(page)
+            album_ids = EromeAlbumExtractor.pattern.findall(page)[::2]
             yield from album_ids

             if len(album_ids) < 36:
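
The `[::2]` slice above relies on every album link occurring exactly twice,
in order, in an erome listing page. Where that assumption is too brittle, an
order-preserving de-duplication does the same job without counting
occurrences — a possible alternative, not what the patch ships:

    def unique(ids):
        # dict keys preserve insertion order on Python 3.7+
        return list(dict.fromkeys(ids))

    assert unique(["a", "a", "b", "b", "c", "c"]) == ["a", "b", "c"]
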

From ab5dde72212e90372221e546cf0f14830194c3fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Fri, 14 Jul 2023 22:36:06 +0200
Subject: [PATCH 217/252] [mangaread] fix 'tags' extraction

---
 gallery_dl/extractor/mangaread.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py
index 49d4d7d6ff..74c239e9b0 100644
--- a/gallery_dl/extractor/mangaread.py
+++ b/gallery_dl/extractor/mangaread.py
@@ -87,7 +87,8 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor):
     )

     def metadata(self, page):
-        data = {"tags": list(text.extract_iter(page, "class>", "<"))}
+        tags = text.extr(page, 'class="wp-manga-tags-list">', '</div>')
+        data = {"tags": list(text.split_html(tags)[::2])}
         info = text.extr(page, '<h1 id="chapter-heading">', "</h1>")
         if not info:
             raise exception.NotFoundError("chapter")
@@ -148,7 +149,7 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor):
             }
         }),
         ("https://www.mangaread.org/manga/doesnotexist", {
-            "exception": exception.NotFoundError,
+            "exception": exception.HttpError,
         }),
     )

From bc9123cfee24e7c77b6f522d1db29014ae6a81ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Fri, 14 Jul 2023 22:41:36 +0200
Subject: [PATCH 218/252] [naverwebtoon] fix 'comic' metadata extraction

---
 gallery_dl/extractor/naverwebtoon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
index d6292af552..cafe4f7f9d 100644
--- a/gallery_dl/extractor/naverwebtoon.py
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -91,7 +91,7 @@ def metadata(self, page):
         return {
             "title_id": self.title_id,
             "episode" : self.episode,
-            "comic"   : extr("titleName: '", "'"),
+            "comic"   : extr('titleName: "', '"'),
             "tags"    : [t.strip() for t in text.extract_iter(
                 extr("tagList: [", "}],"), '"tagName":"', '"')],
             "title"   : extr('"subtitle":"', '"'),

From 248e8bc6996229fd8c3fb064afff14b2b722fa58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Sat, 15 Jul 2023 19:24:43 +0200
Subject: [PATCH 219/252] release version 1.25.8

---
 CHANGELOG.md          | 36 ++++++++++++++++++++++++++++++++++++
 README.rst            |  4 ++--
 gallery_dl/version.py |  2 +-
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b71b404378..53034fa5bc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,41 @@
 # Changelog

+## 1.25.8 - 2023-07-15
+### Changes
+- update default User-Agent header to Firefox 115 ESR
+### Additions
+- [gfycat] support `@me` user ([#3770](https://github.com/mikf/gallery-dl/issues/3770), [#4271](https://github.com/mikf/gallery-dl/issues/4271))
+- [gfycat] implement login support ([#3770](https://github.com/mikf/gallery-dl/issues/3770), [#4271](https://github.com/mikf/gallery-dl/issues/4271))
+- [reddit] notify users about registering an OAuth application ([#4292](https://github.com/mikf/gallery-dl/issues/4292))
+- [twitter] add `ratelimit` option ([#4251](https://github.com/mikf/gallery-dl/issues/4251))
+- [twitter] use `TweetResultByRestId` endpoint that allows accessing single Tweets without login ([#4250](https://github.com/mikf/gallery-dl/issues/4250))
+### Fixes
+- [bunkr] use `.la` TLD for `media-files12` servers ([#4147](https://github.com/mikf/gallery-dl/issues/4147), [#4276](https://github.com/mikf/gallery-dl/issues/4276))
+- [erome] ignore duplicate album IDs
+- [fantia] send `X-Requested-With` header ([#4273](https://github.com/mikf/gallery-dl/issues/4273))
+- [gelbooru_v01] fix `source` metadata ([#4302](https://github.com/mikf/gallery-dl/issues/4302), [#4303](https://github.com/mikf/gallery-dl/issues/4303))
+- [gelbooru_v01] update `vidyart` domain
+- [jpgfish] update domain to `jpeg.pet`
+- [mangaread] fix `tags` metadata extraction
+- [naverwebtoon] fix `comic` metadata extraction
+- [newgrounds] extract & pass auth token during login ([#4268](https://github.com/mikf/gallery-dl/issues/4268))
+- [paheal] fix extraction ([#4262](https://github.com/mikf/gallery-dl/issues/4262), [#4293](https://github.com/mikf/gallery-dl/issues/4293))
+- [paheal] unescape `source`
+- [philomena] fix `--range` ([#4288](https://github.com/mikf/gallery-dl/issues/4288))
+- [philomena] handle `429 Too Many Requests` errors ([#4288](https://github.com/mikf/gallery-dl/issues/4288))
+- [pornhub] set `accessAgeDisclaimerPH` cookie ([#4301](https://github.com/mikf/gallery-dl/issues/4301))
+- [reddit] use 0.6s delay between API requests ([#4292](https://github.com/mikf/gallery-dl/issues/4292))
+- [seiga] set `skip_fetish_warning` cookie ([#4242](https://github.com/mikf/gallery-dl/issues/4242))
+- [slideshare] fix extraction
+- [twitter] fix `following` extractor not getting all users ([#4287](https://github.com/mikf/gallery-dl/issues/4287))
+- [twitter] use GraphQL search endpoint by default ([#4264](https://github.com/mikf/gallery-dl/issues/4264))
+- [twitter] do not treat missing `TimelineAddEntries` instruction as fatal ([#4278](https://github.com/mikf/gallery-dl/issues/4278))
+- [weibo] fix cursor based pagination
+- [wikifeet] fix `tag` extraction ([#4289](https://github.com/mikf/gallery-dl/issues/4289), [#4291](https://github.com/mikf/gallery-dl/issues/4291))
+### Removals
+- [bcy] remove module
+- [lineblog] remove module
+
 ## 1.25.7 - 2023-07-02
 ### Additions
 - [flickr] add 'exif' option
diff --git a/README.rst b/README.rst
index 86dd58deae..51e239c13c 100644
--- a/README.rst
+++ b/README.rst
@@ -72,9 +72,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for

-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.bin>`__

 Nightly Builds
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index bb5c28b3c9..f2a3111644 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.25.8-dev"
+__version__ = "1.25.8"

From 12cd85658b9c7ba76ea07193c08ddefa559296f4 Mon Sep 17 00:00:00 2001
From: enduser420 <91022934+enduser420@users.noreply.github.com>
Date: Sun, 16 Jul 2023 21:22:06 +0530
Subject: [PATCH 220/252] [redgifs] add 'niches' extractor

---
 docs/supportedsites.md          |  2 +-
 gallery_dl/extractor/redgifs.py | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index c1f25d0681..ecf4b73f8e 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -730,7 +730,7 @@ Consider all sites to be NSFW unless otherwise known.
 <tr>
     <td>RedGIFs</td>
     <td>https://redgifs.com/</td>
-    <td>Collections, individual Images, Search Results, User Profiles</td>
+    <td>Collections, individual Images, Niches, Search Results, User Profiles</td>
     <td></td>
 </tr>
 <tr>
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index bfd18b5db4..9109e8dc31 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -158,6 +158,27 @@ def items(self):
             yield Message.Queue, url, collection

+class RedgifsNichesExtractor(RedgifsExtractor):
+    """Extractor for redgifs niches"""
+    subcategory = "niches"
+    pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)"
+    test = (
+        ("https://www.redgifs.com/niches/boobs", {
+            "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4",
+            "range": "1-20",
+            "count": 20,
+        }),
+        ("https://www.redgifs.com/niches/ass", {
+            "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4",
+            "range": "1-20",
+            "count": 20,
+        }),
+    )
+
+    def gifs(self):
+        return self.api.niches(self.key)
+
+
 class RedgifsSearchExtractor(RedgifsExtractor):
     """Extractor for redgifs search results"""
     subcategory = "search"
@@ -271,6 +292,10 @@ def collections(self, user):
         endpoint = "/v2/users/{}/collections".format(user)
         return self._pagination(endpoint, key="collections")

+    def niches(self, niche):
+        endpoint = "/v2/niches/{}/gifs".format(niche)
+        return self._pagination(endpoint)
+
     def search(self, params):
         endpoint = "/v2/gifs/search"
         params["search_text"] = params.pop("tags", None)

From d52ed2bc5a63d58166d4f3b797b7693f528832bb Mon Sep 17 00:00:00 2001
From: enduser420 <91022934+enduser420@users.noreply.github.com>
Date: Tue, 18 Jul 2023 16:38:04 +0530
Subject: [PATCH 221/252] [zerochan] fix 'tags' extraction

---
 gallery_dl/extractor/zerochan.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 03fd909f0b..148b92afc9 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -78,7 +78,8 @@ def _parse_entry_html(self, entry_id):
         html = data["tags"]
         tags = data["tags"] = []
         for tag in html.split("<li class=")[1:]:
-            category, _, name = text.extr(tag, 'alt="', '<').partition('">')
+            category = text.extr(tag, 'alt="', '"')
+            name = text.extr(tag, ">-->", "</a>")
             tags.append(category + ":" + name.strip())

         return data

From 86be197d111aa28b7a51686cd6abbcf43af97354 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Tue, 18 Jul 2023 15:45:37 +0200
Subject: [PATCH 222/252] [twitter] remove '/search/adaptive.json'

---
 docs/configuration.rst          | 13 -------------
 gallery_dl/extractor/twitter.py | 14 --------------
 gallery_dl/version.py           |  2 +-
 3 files changed, 1 insertion(+), 28 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 1868fa788c..e4be90518e 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -3230,19 +3230,6 @@ Description
     will be taken from the original Tweets, not the Retweets.

-extractor.twitter.search-endpoint
----------------------------------
-Type
-    ``string``
-Default
-    ``"graphql"``
-Description
-    Selects the API endpoint used to retrieve search results.
-
-    * ``"graphql"``: GraphQL endpoint
-    * ``"rest"``: Legacy REST endpoint
-
-
 extractor.twitter.timeline.strategy
 -----------------------------------
 Type
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 7b9a2e4cb5..edfec3dc15 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1085,10 +1085,6 @@ def __init__(self, extractor):

         auth_token = cookies.get("auth_token", domain=cookiedomain)

-        search = extractor.config("search-endpoint")
-        if search == "rest":
-            self.search_timeline = self.search_adaptive
-
         self.headers = {
             "Accept": "*/*",
             "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
@@ -1324,16 +1320,6 @@ def list_latest_tweets_timeline(self, list_id):
         return self._pagination_tweets(
             endpoint, variables, ("list", "tweets_timeline", "timeline"))

-    def search_adaptive(self, query):
-        endpoint = "/2/search/adaptive.json"
-        params = self.params.copy()
-        params["q"] = query
-        params["tweet_search_mode"] = "live"
-        params["query_source"] = "typed_query"
-        params["pc"] = "1"
-        params["spelling_corrections"] = "1"
-        return self._pagination_legacy(endpoint, params)
-
     def search_timeline(self, query):
         endpoint = "/graphql/7jT5GT59P8IFjgxwqnEdQw/SearchTimeline"
         variables = {
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index f2a3111644..39cfbd1c5d 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.25.8"
+__version__ = "1.26.0-dev"

From 20ed647f6f2ffc721c6e2441b7883ca6aa4ac1d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Tue, 18 Jul 2023 16:42:55 +0200
Subject: [PATCH 223/252] [twitter] add 'user' extractor and 'include' option
 (#4275)

---
 docs/configuration.rst          | 26 ++++++++++++++++
 docs/supportedsites.md          |  2 +-
 gallery_dl/extractor/twitter.py | 55 +++++++++++++++++++++++++---------
 3 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index e4be90518e..b78feb780e 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -3102,6 +3102,32 @@ Description
     `syndication <extractor.twitter.syndication_>`__ API.

+extractor.twitter.include
+-------------------------
+Type
+    * ``string``
+    * ``list`` of ``strings``
+Default
+    ``"timeline"``
+Example
+    * ``"avatar,background,media"``
+    * ``["avatar", "background", "media"]``
+Description
+    A (comma-separated) list of subcategories to include
+    when processing a user profile.
+
+    Possible values are
+    ``"avatar"``,
+    ``"background"``,
+    ``"timeline"``,
+    ``"tweets"``,
+    ``"media"``,
+    ``"replies"``,
+    ``"likes"``.
+
+    It is possible to use ``"all"`` instead of listing all values separately.
+
+
 extractor.twitter.transform
 ---------------------------
 Type
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index c1f25d0681..6e314b14e6 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -874,7 +874,7 @@ Consider all sites to be NSFW unless otherwise known.
 <tr>
     <td>Twitter</td>
     <td>https://twitter.com/</td>
-    <td>Avatars, Backgrounds, Bookmarks, Events, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Search Results, Timelines, Tweets</td>
+    <td>Avatars, Backgrounds, Bookmarks, Events, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Search Results, Timelines, Tweets, User Profiles</td>
     <td>Supported</td>
 </tr>
 <tr>
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index edfec3dc15..a5bc0eb0a2 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -461,23 +461,18 @@ def login(self):
             self._update_cookies(_login_impl(self, username, password))

-class TwitterTimelineExtractor(TwitterExtractor):
-    """Extractor for a Twitter user timeline"""
-    subcategory = "timeline"
+class TwitterUserExtractor(TwitterExtractor):
+    """Extractor for a Twitter user"""
+    subcategory = "user"
     pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
                r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
     test = (
         ("https://twitter.com/supernaturepics", {
-            "range": "1-40",
-            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
-        }),
-        # suspended account (#2216)
-        ("https://twitter.com/OptionalTypo", {
-            "exception": exception.NotFoundError,
-        }),
-        # suspended account user ID
-        ("https://twitter.com/id:772949683521978368", {
-            "exception": exception.NotFoundError,
+            "options": (("include", "all"),),
+            "pattern": r"https://twitter\.com/supernaturepics"
+                       r"/(photo|header_photo|timeline|tweets"
+                       r"|media|with_replies|likes)$",
+            "count": 7,
         }),
         ("https://mobile.twitter.com/supernaturepics?p=i"),
         ("https://www.twitter.com/id:2976459548"),
@@ -493,6 +488,40 @@ def __init__(self, match):
         if user_id:
             self.user = "id:" + user_id

+    def items(self):
+        base = "{}/{}/".format(self.root, self.user)
+        return self._dispatch_extractors((
+            (TwitterAvatarExtractor    , base + "photo"),
+            (TwitterBackgroundExtractor, base + "header_photo"),
+            (TwitterTimelineExtractor  , base + "timeline"),
+            (TwitterTweetsExtractor    , base + "tweets"),
+            (TwitterMediaExtractor     , base + "media"),
+            (TwitterRepliesExtractor   , base + "with_replies"),
+            (TwitterLikesExtractor     , base + "likes"),
+        ), ("timeline",))
+
+
+class TwitterTimelineExtractor(TwitterExtractor):
+    """Extractor for a Twitter user timeline"""
+    subcategory = "timeline"
+    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)"
+    test = (
+        ("https://twitter.com/supernaturepics/timeline", {
+            "range": "1-40",
+            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
+        }),
+        # suspended account (#2216)
+        ("https://twitter.com/OptionalTypo/timeline", {
+            "exception": exception.NotFoundError,
+        }),
+        # suspended account user ID
+        ("https://twitter.com/id:772949683521978368/timeline", {
+            "exception": exception.NotFoundError,
+        }),
+        ("https://mobile.twitter.com/supernaturepics/timeline#t"),
+        ("https://www.twitter.com/id:2976459548/timeline"),
+    )
+
     def tweets(self):
         # yield initial batch of (media) tweets
         tweet = None

From 90231f2d5a2fa9ed08c5ed448a45cbe0e5fda12b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Tue, 18 Jul 2023 17:19:32 +0200
Subject: [PATCH 224/252] [twitter] add 'tweet-endpoint' option (#4307)

use the newer TweetResultByRestId only for guests by default
---
 docs/configuration.rst          | 14 ++++++++++++++
 gallery_dl/extractor/twitter.py | 25 +++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index b78feb780e..b413c9573d 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -3138,6 +3138,20 @@ Description
     Transform Tweet and User metadata into a simpler, uniform format.

+extractor.twitter.tweet-endpoint
+--------------------------------
+Type
+    ``string``
+Default
+    ``"auto"``
+Description
+    Selects the API endpoint used to retrieve single Tweets.
+
+    * ``"restid"``: ``/TweetResultByRestId`` - accessible to guest users
+    * ``"detail"``: ``/TweetDetail`` - more stable
+    * ``"auto"``: ``"detail"`` when logged in, ``"restid"`` otherwise
+
+
 extractor.twitter.size
 ----------------------
 Type
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index a5bc0eb0a2..092ddb4914 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -952,8 +952,13 @@ def tweets(self):
         if conversations:
             self._accessible = (conversations == "accessible")
             return self._tweets_conversation(self.tweet_id)
-        else:
-            return self._tweets_single(self.tweet_id)
+
+        endpoint = self.config("tweet-endpoint")
+        if endpoint == "detail" or endpoint in (None, "auto") and \
+                self.api.headers["x-twitter-auth-type"]:
+            return self._tweets_detail(self.tweet_id)
+
+        return self._tweets_single(self.tweet_id)

     def _tweets_single(self, tweet_id):
         tweets = []
@@ -970,6 +975,22 @@ def _tweets_single(self, tweet_id):

         return tweets

+    def _tweets_detail(self, tweet_id):
+        tweets = []
+
+        for tweet in self.api.tweet_detail(tweet_id):
+            if tweet["rest_id"] == tweet_id or \
+                    tweet.get("_retweet_id_str") == tweet_id:
+                if self._user_obj is None:
+                    self._assign_user(tweet["core"]["user_results"]["result"])
+                tweets.append(tweet)
+
+                tweet_id = tweet["legacy"].get("quoted_status_id_str")
+                if not tweet_id:
+                    break
+
+        return tweets
+
     def _tweets_conversation(self, tweet_id):
         tweets = self.api.tweet_detail(tweet_id)
         buffer = []
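
The `tweet-endpoint` option added above boils down to one decision: guests
can only use `TweetResultByRestId`, while logged-in sessions default to the
more stable `TweetDetail`. The selection logic, restated as a plain function
(`logged_in` stands in for the auth-type header check in the patch):

    def pick_endpoint(setting, logged_in):
        if setting == "detail" or (setting in (None, "auto") and logged_in):
            return "TweetDetail"
        return "TweetResultByRestId"
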
self.user, self.type or "art", self.deviation_id) + else: + url = "{}/view/{}/".format(self.root, self.deviation_id) - uuid = text.extract(self._limited_request(url).text, - '"deviationUuid\\":\\"', '\\')[0] + uuid = text.extr(self._limited_request(url).text, + '"deviationUuid\\":\\"', '\\') if not uuid: raise exception.NotFoundError("deviation") return (self.api.deviation(uuid),) From a996d936d2caf6efeef52c53e5a5de99e5b3f593 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 18 Jul 2023 17:56:33 +0200 Subject: [PATCH 226/252] [imagefap] fix pagination (#3013) --- gallery_dl/extractor/imagefap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index c91347e65b..f5b69faa2e 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -283,7 +283,7 @@ def galleries(self, folder_id): yield gid, extr("<b>", "<") cnt += 1 - if cnt < 25: + if cnt < 20: break params["page"] += 1 From 5171d8975c47efc95a3cace4d6c31c79c5a66155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 18 Jul 2023 18:16:30 +0200 Subject: [PATCH 227/252] [E621] support 'e6ai.net' (#4320) --- docs/supportedsites.md | 6 ++++++ gallery_dl/extractor/e621.py | 23 +++++++++++++++++++++++ scripts/supportedsites.py | 1 + test/test_results.py | 2 +- 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6e314b14e6..09927ec95c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1041,6 +1041,12 @@ Consider all sites to be NSFW unless otherwise known. <td>Favorites, Pools, Popular Images, Posts, Tag Searches</td> <td>Supported</td> </tr> +<tr> + <td>e6AI</td> + <td>https://e6ai.net/</td> + <td>Favorites, Pools, Popular Images, Posts, Tag Searches</td> + <td></td> +</tr> <tr> <td colspan="4"><strong>Gelbooru Beta 0.1.11</strong></td> diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index d4f6cd4b35..cb1aea40e0 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -74,6 +74,10 @@ def items(self): "root": "https://e926.net", "pattern": r"e926\.net", }, + "e6ai": { + "root": "https://e6ai.net", + "pattern": r"e6ai\.net", + }, }) @@ -94,6 +98,10 @@ class E621TagExtractor(E621Extractor, danbooru.DanbooruTagExtractor): }), ("https://e926.net/post/index/1/anry"), ("https://e926.net/post?tags=anry"), + + ("https://e6ai.net/posts?tags=anry"), + ("https://e6ai.net/post/index/1/anry"), + ("https://e6ai.net/post?tags=anry"), ) @@ -112,6 +120,11 @@ class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor): "content": "91abe5d5334425d9787811d7f06d34c77974cd22", }), ("https://e926.net/pool/show/73"), + + ("https://e6ai.net/pools/3", { + "url": "a6d1ad67a3fa9b9f73731d34d5f6f26f7e85855f", + }), + ("https://e6ai.net/pool/show/3"), ) def posts(self): @@ -192,6 +205,12 @@ class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor): "content": "66f46e96a893fba8e694c4e049b23c2acc9af462", }), ("https://e926.net/post/show/535"), + + ("https://e6ai.net/posts/23", { + "url": "3c85a806b3d9eec861948af421fe0e8ad6b8f881", + "content": "a05a484e4eb64637d56d751c02e659b4bc8ea5d5", + }), + ("https://e6ai.net/post/show/23"), ) def posts(self): @@ -216,6 +235,8 @@ class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor): "pattern": r"https://static\d.e926.net/data/../../[0-9a-f]+", "count": ">= 70", 
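The extractor.twitter.include option documented earlier in this series pairs with the new user-profile dispatch: it decides which sub-extractors _dispatch_extractors() fans out to. A hedged usage example via gallery_dl's config module, mirroring the config.set() calls the test suite uses; the option values are only illustrative:

    from gallery_dl import config

    # comma-separated string form, as documented
    config.set(("extractor", "twitter"), "include", "avatar,media")

    # or, equivalently, the list form
    config.set(("extractor", "twitter"), "include", ["avatar", "media"])

Setting "include" to "all" dispatches to every subcategory from avatar through likes.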
}), + + ("https://e6ai.net/explore/posts/popular"), ) def posts(self): @@ -240,6 +261,8 @@ class E621FavoriteExtractor(E621Extractor): "pattern": r"https://static\d.e926.net/data/../../[0-9a-f]+", "count": "> 260", }), + + ("https://e6ai.net/favorites"), ) def __init__(self, match): diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index c9fc0ab5cc..5415276625 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -35,6 +35,7 @@ "dynastyscans" : "Dynasty Reader", "e621" : "e621", "e926" : "e926", + "e6ai" : "e6AI", "erome" : "EroMe", "e-hentai" : "E-Hentai", "exhentai" : "ExHentai", diff --git a/test/test_results.py b/test/test_results.py index 3c7d2844f7..9899d1c13b 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -322,7 +322,7 @@ def setup_test_config(): config.set(("extractor", "mangoxo") , "username", "LiQiang3") config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") - for category in ("danbooru", "atfbooru", "aibooru", "e621", "e926", + for category in ("danbooru", "atfbooru", "aibooru", "e621", "e926", "e6ai", "instagram", "twitter", "subscribestar", "deviantart", "inkbunny", "tapas", "pillowfort", "mangadex", "vipergirls", "gfycat"): From 63326e3168f50db1fb3b4ad696676f472f8b2b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 18 Jul 2023 18:29:57 +0200 Subject: [PATCH 228/252] [danbooru] add tests for booruvar --- docs/supportedsites.md | 6 ++++++ gallery_dl/extractor/danbooru.py | 15 ++++++++++++++- test/test_results.py | 3 ++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e42cdf675e..63ac6d314a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1019,6 +1019,12 @@ Consider all sites to be NSFW unless otherwise known. 
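It is worth spelling out why the imagefap pagination fix (#3013) a few hunks above lowers the threshold from 25 to 20: folder pages apparently hold at most 20 galleries, so under the old check even a completely full page satisfied cnt < 25 and iteration stopped after the first page. A minimal sketch of the corrected stop condition, with fetch_page as a hypothetical stand-in for the real request-and-parse code:

    def iterate_galleries(fetch_page, page_size=20):
        page = 1
        while True:
            galleries = fetch_page(page)    # galleries found on this page
            yield from galleries
            if len(galleries) < page_size:  # only a short page is the last
                break
            page += 1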
<td>Pools, Popular Images, Posts, Tag Searches</td> <td>Supported</td> </tr> +<tr> + <td>Booruvar</td> + <td>https://booru.borvar.art/</td> + <td>Pools, Popular Images, Posts, Tag Searches</td> + <td></td> +</tr> <tr> <td colspan="4"><strong>e621 Instances</strong></td> diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 0ce77ad361..19a3aeff18 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -163,7 +163,7 @@ def _ugoira_frames(self, post): }, "booruvar": { "root": "https://booru.borvar.art", - "pattern": r"booru\.borvar\.art" + "pattern": r"booru\.borvar\.art", }, }) @@ -196,6 +196,11 @@ class DanbooruTagExtractor(DanbooruExtractor): r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w+", "count": ">= 3", }), + ("https://booru.borvar.art/posts?tags=chibi&z=1", { + "pattern": r"https://booru\.borvar\.art/data/original" + r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w+", + "count": ">= 3", + }), ("https://hijiribe.donmai.us/posts?tags=bonocho"), ("https://sonohara.donmai.us/posts?tags=bonocho"), ("https://safebooru.donmai.us/posts?tags=bonocho"), @@ -242,6 +247,10 @@ class DanbooruPoolExtractor(DanbooruExtractor): "url": "902549ffcdb00fe033c3f63e12bc3cb95c5fd8d5", "count": 6, }), + ("https://booru.borvar.art/pools/2", { + "url": "77fa3559a3fc919f72611f4e3dd0f919d19d3e0d", + "count": 4, + }), ("https://aibooru.online/pools/1"), ("https://danbooru.donmai.us/pool/show/7659"), ) @@ -282,6 +291,9 @@ class DanbooruPostExtractor(DanbooruExtractor): ("https://aibooru.online/posts/1", { "content": "54d548743cd67799a62c77cbae97cfa0fec1b7e9", }), + ("https://booru.borvar.art/posts/1487", { + "content": "91273ac1ea413a12be468841e2b5804656a50bff", + }), ("https://danbooru.donmai.us/post/show/294929"), ) @@ -313,6 +325,7 @@ class DanbooruPopularExtractor(DanbooruExtractor): }), ("https://booru.allthefallen.moe/explore/posts/popular"), ("https://aibooru.online/explore/posts/popular"), + ("https://booru.borvar.art/explore/posts/popular"), ) def __init__(self, match): diff --git a/test/test_results.py b/test/test_results.py index d28496b3dd..72654a9373 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -322,7 +322,8 @@ def setup_test_config(): config.set(("extractor", "mangoxo") , "username", "LiQiang3") config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") - for category in ("danbooru", "atfbooru", "aibooru", "e621", "e926", + for category in ("danbooru", "atfbooru", "aibooru", "booruvar", + "e621", "e926", "instagram", "twitter", "subscribestar", "deviantart", "inkbunny", "tapas", "pillowfort", "mangadex"): config.set(("extractor", category), "username", None) From ceebacc9e13520255f46bc9625c1bcd60fbfec60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 19 Jul 2023 20:44:07 +0200 Subject: [PATCH 229/252] remove 'pyopenssl' option --- docs/configuration.rst | 11 ----------- gallery_dl/extractor/common.py | 10 ---------- 2 files changed, 21 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index b413c9573d..fc531632b2 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -5065,17 +5065,6 @@ Description used for (urllib3) warnings. -pyopenssl ---------- -Type - ``bool`` -Default - ``false`` -Description - Use `pyOpenSSL <https://www.pyopenssl.org/en/stable/>`__-backed - SSL-support. 
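The "booruvar" entry above, like the "e6ai" one in the previous patch, is all that is needed to support another instance: the entry's URL pattern selects the category, and its root becomes the base for subsequent requests. A hedged, heavily simplified illustration of that dispatch; resolve_instance() is hypothetical and not gallery-dl's actual implementation:

    import re

    INSTANCES = {
        "danbooru": {"root": "https://danbooru.donmai.us",
                     "pattern": r"danbooru\.donmai\.us"},
        "booruvar": {"root": "https://booru.borvar.art",
                     "pattern": r"booru\.borvar\.art"},
    }

    def resolve_instance(url):
        for category, info in INSTANCES.items():
            if re.match(r"(?:https?://)?" + info["pattern"], url):
                return category, info["root"]
        return None, None

    # resolve_instance("https://booru.borvar.art/posts/1487")
    # -> ("booruvar", "https://booru.borvar.art")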
- - API Tokens & IDs ================ diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 5c9b157787..3cb5fc417f 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -897,13 +897,3 @@ def _browser_useragent(): except Exception: pass del action - -# Undo automatic pyOpenSSL injection by requests -pyopenssl = config.get((), "pyopenssl", False) -if not pyopenssl: - try: - from requests.packages.urllib3.contrib import pyopenssl # noqa - pyopenssl.extract_from_urllib3() - except ImportError: - pass -del pyopenssl From d97b8c2fba2c5b09955563d03f473cb11e1aa1df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 21 Jul 2023 22:38:39 +0200 Subject: [PATCH 230/252] consistent cookie-related names - rename every cookie variable or method to 'cookies_*' - simplify '.session.cookies' to just '.cookies' - more consistent 'login()' structure --- gallery_dl/extractor/500px.py | 6 +- gallery_dl/extractor/8chan.py | 8 +-- gallery_dl/extractor/aryion.py | 11 ++-- gallery_dl/extractor/common.py | 66 ++++++++++---------- gallery_dl/extractor/deviantart.py | 25 ++++---- gallery_dl/extractor/erome.py | 2 +- gallery_dl/extractor/exhentai.py | 34 ++++++----- gallery_dl/extractor/fanbox.py | 3 +- gallery_dl/extractor/fantia.py | 2 +- gallery_dl/extractor/flickr.py | 2 +- gallery_dl/extractor/furaffinity.py | 8 +-- gallery_dl/extractor/gofile.py | 2 +- gallery_dl/extractor/hentaifoundry.py | 8 +-- gallery_dl/extractor/idolcomplex.py | 18 +++--- gallery_dl/extractor/imagebam.py | 2 +- gallery_dl/extractor/imgbb.py | 4 +- gallery_dl/extractor/instagram.py | 22 ++++--- gallery_dl/extractor/itchio.py | 2 +- gallery_dl/extractor/kemonoparty.py | 8 +-- gallery_dl/extractor/luscious.py | 2 +- gallery_dl/extractor/mangahere.py | 2 +- gallery_dl/extractor/mangasee.py | 2 +- gallery_dl/extractor/mangoxo.py | 8 +-- gallery_dl/extractor/mastodon.py | 2 +- gallery_dl/extractor/newgrounds.py | 13 ++-- gallery_dl/extractor/nijie.py | 17 +++--- gallery_dl/extractor/nitter.py | 4 +- gallery_dl/extractor/paheal.py | 2 +- gallery_dl/extractor/patreon.py | 6 +- gallery_dl/extractor/pillowfort.py | 11 ++-- gallery_dl/extractor/pixiv.py | 4 +- gallery_dl/extractor/pornhub.py | 2 +- gallery_dl/extractor/reddit.py | 6 +- gallery_dl/extractor/sankaku.py | 2 +- gallery_dl/extractor/seiga.py | 6 +- gallery_dl/extractor/senmanga.py | 2 +- gallery_dl/extractor/shimmie2.py | 2 +- gallery_dl/extractor/smugmug.py | 2 +- gallery_dl/extractor/subscribestar.py | 12 ++-- gallery_dl/extractor/tapas.py | 23 ++++--- gallery_dl/extractor/tsumino.py | 8 +-- gallery_dl/extractor/tumblr.py | 2 +- gallery_dl/extractor/twitter.py | 36 +++++------ gallery_dl/extractor/vipergirls.py | 14 +++-- gallery_dl/extractor/webtoons.py | 4 +- gallery_dl/extractor/weibo.py | 2 +- gallery_dl/extractor/ytdl.py | 2 +- gallery_dl/extractor/zerochan.py | 18 +++--- gallery_dl/job.py | 2 +- test/test_cookies.py | 88 +++++++++++++-------------- 50 files changed, 278 insertions(+), 261 deletions(-) diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index 1213194a4d..ac38b60406 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -21,7 +21,7 @@ class _500pxExtractor(Extractor): filename_fmt = "{id}_{name}.{extension}" archive_fmt = "{id}" root = "https://500px.com" - cookiedomain = ".500px.com" + cookies_domain = ".500px.com" def __init__(self, match): Extractor.__init__(self, match) @@ -73,7 +73,7 @@ def _extend(self, 
edges): def _request_api(self, url, params): headers = { "Origin": self.root, - "x-csrf-token": self.session.cookies.get( + "x-csrf-token": self.cookies.get( "x-csrf-token", domain=".500px.com"), } return self.request(url, headers=headers, params=params).json() @@ -81,7 +81,7 @@ def _request_api(self, url, params): def _request_graphql(self, opname, variables): url = "https://api.500px.com/graphql" headers = { - "x-csrf-token": self.session.cookies.get( + "x-csrf-token": self.cookies.get( "x-csrf-token", domain=".500px.com"), } data = { diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index 0e128c3a3a..f098008a60 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -27,7 +27,7 @@ def __init__(self, match): Extractor.__init__(self, match) @memcache() - def _prepare_cookies(self): + def cookies_prepare(self): # fetch captcha cookies # (necessary to download without getting interrupted) now = datetime.utcnow() @@ -39,14 +39,14 @@ def _prepare_cookies(self): # - remove 'expires' timestamp # - move 'captchaexpiration' value forward by 1 month) domain = self.root.rpartition("/")[2] - for cookie in self.session.cookies: + for cookie in self.cookies: if cookie.domain.endswith(domain): cookie.expires = None if cookie.name == "captchaexpiration": cookie.value = (now + timedelta(30, 300)).strftime( "%a, %d %b %Y %H:%M:%S GMT") - return self.session.cookies + return self.cookies class _8chanThreadExtractor(_8chanExtractor): @@ -113,7 +113,7 @@ def items(self): thread["_http_headers"] = {"Referer": url + "html"} try: - self.session.cookies = self._prepare_cookies() + self.cookies = self.cookies_prepare() except Exception as exc: self.log.debug("Failed to fetch captcha cookies: %s: %s", exc.__class__.__name__, exc, exc_info=True) diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index 6f0157282a..ad0f9dc672 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -23,8 +23,8 @@ class AryionExtractor(Extractor): directory_fmt = ("{category}", "{user!l}", "{path:J - }") filename_fmt = "{id} {title}.{extension}" archive_fmt = "{id}" - cookiedomain = ".aryion.com" - cookienames = ("phpbb3_rl7a3_sid",) + cookies_domain = ".aryion.com" + cookies_names = ("phpbb3_rl7a3_sid",) root = "https://aryion.com" def __init__(self, match): @@ -33,11 +33,12 @@ def __init__(self, match): self.recursive = True def login(self): - if self._check_cookies(self.cookienames): + if self.cookies_check(self.cookies_names): return + username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) + self.cookies_update(self._login_impl(username, password)) @cache(maxage=14*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -53,7 +54,7 @@ def _login_impl(self, username, password): response = self.request(url, method="POST", data=data) if b"You have been successfully logged in." 
not in response.content: raise exception.AuthenticationError() - return {c: response.cookies[c] for c in self.cookienames} + return {c: response.cookies[c] for c in self.cookies_names} def items(self): self.login() diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 3cb5fc417f..2e5ce4d4a6 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -32,7 +32,7 @@ class Extractor(): directory_fmt = ("{category}",) filename_fmt = "{filename}.{extension}" archive_fmt = "" - cookiedomain = "" + cookies_domain = "" browser = None root = "" test = None @@ -330,26 +330,26 @@ def _init_session(self): def _init_cookies(self): """Populate the session's cookiejar""" - self._cookiefile = None - self._cookiejar = self.session.cookies - if self.cookiedomain is None: + self.cookies = self.session.cookies + self.cookies_file = None + if self.cookies_domain is None: return cookies = self.config("cookies") if cookies: if isinstance(cookies, dict): - self._update_cookies_dict(cookies, self.cookiedomain) + self.cookies_update_dict(cookies, self.cookies_domain) elif isinstance(cookies, str): - cookiefile = util.expand_path(cookies) + path = util.expand_path(cookies) try: - with open(cookiefile) as fp: - util.cookiestxt_load(fp, self._cookiejar) + with open(path) as fp: + util.cookiestxt_load(fp, self.cookies) except Exception as exc: self.log.warning("cookies: %s", exc) else: self.log.debug("Loading cookies from '%s'", cookies) - self._cookiefile = cookiefile + self.cookies_file = path elif isinstance(cookies, (list, tuple)): key = tuple(cookies) @@ -357,7 +357,7 @@ def _init_cookies(self): if cookiejar is None: from ..cookies import load_cookies - cookiejar = self._cookiejar.__class__() + cookiejar = self.cookies.__class__() try: load_cookies(cookiejar, cookies) except Exception as exc: @@ -367,9 +367,9 @@ def _init_cookies(self): else: self.log.debug("Using cached cookies from %s", key) - setcookie = self._cookiejar.set_cookie + set_cookie = self.cookies.set_cookie for cookie in cookiejar: - setcookie(cookie) + set_cookie(cookie) else: self.log.warning( @@ -377,8 +377,8 @@ def _init_cookies(self): "option, got '%s' (%s)", cookies.__class__.__name__, cookies) - def _store_cookies(self): - """Store the session's cookiejar in a cookies.txt file""" + def cookies_store(self): + """Store the session's cookies in a cookies.txt file""" export = self.config("cookies-update", True) if not export: return @@ -386,47 +386,47 @@ def _store_cookies(self): if isinstance(export, str): path = util.expand_path(export) else: - path = self._cookiefile + path = self.cookies_file if not path: return try: with open(path, "w") as fp: - util.cookiestxt_store(fp, self._cookiejar) + util.cookiestxt_store(fp, self.cookies) except OSError as exc: self.log.warning("cookies: %s", exc) - def _update_cookies(self, cookies, domain=""): + def cookies_update(self, cookies, domain=""): """Update the session's cookiejar with 'cookies'""" if isinstance(cookies, dict): - self._update_cookies_dict(cookies, domain or self.cookiedomain) + self.cookies_update_dict(cookies, domain or self.cookies_domain) else: - setcookie = self._cookiejar.set_cookie + set_cookie = self.cookies.set_cookie try: cookies = iter(cookies) except TypeError: - setcookie(cookies) + set_cookie(cookies) else: for cookie in cookies: - setcookie(cookie) + set_cookie(cookie) - def _update_cookies_dict(self, cookiedict, domain): + def cookies_update_dict(self, cookiedict, domain): """Update cookiejar with name-value pairs from a 
dict""" - setcookie = self._cookiejar.set + set_cookie = self.cookies.set for name, value in cookiedict.items(): - setcookie(name, value, domain=domain) + set_cookie(name, value, domain=domain) - def _check_cookies(self, cookienames, domain=None): - """Check if all 'cookienames' are in the session's cookiejar""" - if not self._cookiejar: + def cookies_check(self, cookies_names, domain=None): + """Check if all 'cookies_names' are in the session's cookiejar""" + if not self.cookies: return False if domain is None: - domain = self.cookiedomain - names = set(cookienames) + domain = self.cookies_domain + names = set(cookies_names) now = time.time() - for cookie in self._cookiejar: + for cookie in self.cookies: if cookie.name in names and ( not domain or cookie.domain == domain): @@ -450,9 +450,9 @@ def _check_cookies(self, cookienames, domain=None): return False def _prepare_ddosguard_cookies(self): - if not self._cookiejar.get("__ddg2", domain=self.cookiedomain): - self._cookiejar.set( - "__ddg2", util.generate_token(), domain=self.cookiedomain) + if not self.cookies.get("__ddg2", domain=self.cookies_domain): + self.cookies.set( + "__ddg2", util.generate_token(), domain=self.cookies_domain) def _get_date_min_max(self, dmin=None, dmax=None): """Retrieve and parse 'date-min' and 'date-max' config values""" diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 1ad2bd9626..9f16b3345c 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -32,8 +32,8 @@ class DeviantartExtractor(Extractor): root = "https://www.deviantart.com" directory_fmt = ("{category}", "{username}") filename_fmt = "{category}_{index}_{title}.{extension}" - cookiedomain = None - cookienames = ("auth", "auth_secure", "userinfo") + cookies_domain = None + cookies_names = ("auth", "auth_secure", "userinfo") _last_request = 0 def __init__(self, match): @@ -71,12 +71,13 @@ def skip(self, num): return num def login(self): - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - if not username: - return False - self._update_cookies(_login_impl(self, username, password)) - return True + if self.cookies_check(self.cookies_names): + return True + + username, password = self._get_auth_info() + if username: + self.cookies_update(_login_impl(self, username, password)) + return True def items(self): self.api = DeviantartOAuthAPI(self) @@ -1123,7 +1124,7 @@ class DeviantartScrapsExtractor(DeviantartExtractor): subcategory = "scraps" directory_fmt = ("{category}", "{username}", "Scraps") archive_fmt = "s_{_username}_{index}.{extension}" - cookiedomain = ".deviantart.com" + cookies_domain = ".deviantart.com" pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b" test = ( ("https://www.deviantart.com/shimoda7/gallery/scraps", { @@ -1146,7 +1147,7 @@ class DeviantartSearchExtractor(DeviantartExtractor): subcategory = "search" directory_fmt = ("{category}", "Search", "{search_tags}") archive_fmt = "Q_{search_tags}_{index}.{extension}" - cookiedomain = ".deviantart.com" + cookies_domain = ".deviantart.com" pattern = (r"(?:https?://)?www\.deviantart\.com" r"/search(?:/deviations)?/?\?([^#]+)") test = ( @@ -1205,7 +1206,7 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): """Extractor for deviantart gallery searches""" subcategory = "gallery-search" archive_fmt = "g_{_username}_{index}.{extension}" - cookiedomain = ".deviantart.com" + cookies_domain = ".deviantart.com" pattern = BASE_PATTERN + r"/gallery/?\?(q=[^#]+)" 
test = ( ("https://www.deviantart.com/shimoda7/gallery?q=memory", { @@ -1869,7 +1870,7 @@ def _login_impl(extr, username, password): return { cookie.name: cookie.value - for cookie in extr.session.cookies + for cookie in extr.cookies } diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 709bc5769b..cb527410f3 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -65,7 +65,7 @@ def albums(self): def request(self, url, **kwargs): if self.__cookies: self.__cookies = False - self.session.cookies.update(_cookie_cache()) + self.cookies.update(_cookie_cache()) for _ in range(5): response = Extractor.request(self, url, **kwargs) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 9cd7ae4e67..087ff51cc2 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -23,8 +23,8 @@ class ExhentaiExtractor(Extractor): directory_fmt = ("{category}", "{gid} {title[:247]}") filename_fmt = "{gid}_{num:>04}_{image_token}_{filename}.{extension}" archive_fmt = "{gid}_{num}" - cookienames = ("ipb_member_id", "ipb_pass_hash") - cookiedomain = ".exhentai.org" + cookies_domain = ".exhentai.org" + cookies_names = ("ipb_member_id", "ipb_pass_hash") root = "https://exhentai.org" request_interval = 5.0 @@ -39,7 +39,7 @@ def __init__(self, match): if domain == "auto": domain = ("ex" if version == "ex" else "e-") + "hentai.org" self.root = "https://" + domain - self.cookiedomain = "." + domain + self.cookies_domain = "." + domain Extractor.__init__(self, match) self.original = self.config("original", True) @@ -53,7 +53,7 @@ def __init__(self, match): self.session.headers["Referer"] = self.root + "/" if version != "ex": - self.session.cookies.set("nw", "1", domain=self.cookiedomain) + self.cookies.set("nw", "1", domain=self.cookies_domain) def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) @@ -66,17 +66,20 @@ def login(self): """Login and set necessary cookies""" if self.LIMIT: raise exception.StopExtraction("Image limit reached!") - if self._check_cookies(self.cookienames): + + if self.cookies_check(self.cookies_names): return + username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) - else: - self.log.info("no username given; using e-hentai.org") - self.root = "https://e-hentai.org" - self.original = False - self.limits = False - self.session.cookies["nw"] = "1" + return self.cookies_update(self._login_impl(username, password)) + + self.log.info("no username given; using e-hentai.org") + self.root = "https://e-hentai.org" + self.cookies_domain = ".e-hentai.org" + self.cookies.set("nw", "1", domain=self.cookies_domain) + self.original = False + self.limits = False @cache(maxage=90*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -97,7 +100,7 @@ def _login_impl(self, username, password): response = self.request(url, method="POST", headers=headers, data=data) if b"You are now logged in as:" not in response.content: raise exception.AuthenticationError() - return {c: response.cookies[c] for c in self.cookienames} + return {c: response.cookies[c] for c in self.cookies_names} class ExhentaiGalleryExtractor(ExhentaiExtractor): @@ -390,8 +393,9 @@ def _update_limits(self): url = "https://e-hentai.org/home.php" cookies = { cookie.name: cookie.value - for cookie in self.session.cookies - if cookie.domain == self.cookiedomain and cookie.name != "igneous" + for cookie in self.cookies + if cookie.domain == 
self.cookies_domain and + cookie.name != "igneous" } page = self.request(url, cookies=cookies).text diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 373529f447..40ad8cdd95 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -32,9 +32,8 @@ def __init__(self, match): self.embeds = self.config("embeds", True) def items(self): - if self._warning: - if not self._check_cookies(("FANBOXSESSID",)): + if not self.cookies_check(("FANBOXSESSID",)): self.log.warning("no 'FANBOXSESSID' cookie set") FanboxExtractor._warning = False diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index f92b904633..3679e3750b 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -35,7 +35,7 @@ def items(self): } if self._warning: - if not self._check_cookies(("_session_id",)): + if not self.cookies_check(("_session_id",)): self.log.warning("no '_session_id' cookie set") FantiaExtractor._warning = False diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index d44ff3c842..9f97a3316b 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -20,7 +20,7 @@ class FlickrExtractor(Extractor): filename_fmt = "{category}_{id}.{extension}" directory_fmt = ("{category}", "{user[username]}") archive_fmt = "{id}" - cookiedomain = None + cookies_domain = None def __init__(self, match): Extractor.__init__(self, match) diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index ec9cd94091..c03c89b2ef 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -20,7 +20,8 @@ class FuraffinityExtractor(Extractor): directory_fmt = ("{category}", "{user!l}") filename_fmt = "{id}{title:? 
//}.{extension}" archive_fmt = "{id}" - cookiedomain = ".furaffinity.net" + cookies_domain = ".furaffinity.net" + cookies_names = ("a", "b") root = "https://www.furaffinity.net" _warning = True @@ -39,9 +40,8 @@ def __init__(self, match): self._new_layout = None def items(self): - if self._warning: - if not self._check_cookies(("a", "b")): + if not self.cookies_check(self.cookies_names): self.log.warning("no 'a' and 'b' session cookies set") FuraffinityExtractor._warning = False @@ -371,7 +371,7 @@ def posts(self): class FuraffinityUserExtractor(FuraffinityExtractor): """Extractor for furaffinity user profiles""" subcategory = "user" - cookiedomain = None + cookies_domain = None pattern = BASE_PATTERN + r"/user/([^/?#]+)" test = ( ("https://www.furaffinity.net/user/mirlinthloth/", { diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 044dddbd8d..60886a9dbf 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -72,7 +72,7 @@ def items(self): token = self.config("api-token") if not token: token = self._create_account() - self.session.cookies.set("accountToken", token, domain=".gofile.io") + self.cookies.set("accountToken", token, domain=".gofile.io") self.api_token = token self.website_token = (self.config("website-token") or diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index e01a4ed8ad..78a576df84 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -20,7 +20,7 @@ class HentaifoundryExtractor(Extractor): directory_fmt = ("{category}", "{user}") filename_fmt = "{category}_{index}_{title}.{extension}" archive_fmt = "{index}" - cookiedomain = "www.hentai-foundry.com" + cookies_domain = "www.hentai-foundry.com" root = "https://www.hentai-foundry.com" per_page = 25 @@ -123,14 +123,14 @@ def _parse_story(self, html): def _init_site_filters(self): """Set site-internal filters to show all images""" - if self.session.cookies.get("PHPSESSID", domain=self.cookiedomain): + if self.cookies.get("PHPSESSID", domain=self.cookies_domain): return url = self.root + "/?enterAgree=1" self.request(url, method="HEAD") - csrf_token = self.session.cookies.get( - "YII_CSRF_TOKEN", domain=self.cookiedomain) + csrf_token = self.cookies.get( + "YII_CSRF_TOKEN", domain=self.cookies_domain) if not csrf_token: self.log.warning("Unable to update site content filters") return diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index ce68d6d356..02f037dd85 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -19,9 +19,9 @@ class IdolcomplexExtractor(SankakuExtractor): """Base class for idolcomplex extractors""" category = "idolcomplex" - cookienames = ("login", "pass_hash") - cookiedomain = "idol.sankakucomplex.com" - root = "https://" + cookiedomain + cookies_domain = "idol.sankakucomplex.com" + cookies_names = ("login", "pass_hash") + root = "https://" + cookies_domain request_interval = 5.0 def __init__(self, match): @@ -51,14 +51,14 @@ def post_ids(self): """Return an iterable containing all relevant post ids""" def login(self): - if self._check_cookies(self.cookienames): + if self.cookies_check(self.cookies_names): return + username, password = self._get_auth_info() if username: - cookies = self._login_impl(username, password) - self._update_cookies(cookies) - else: - self.logged_in = False + return self.cookies_update(self._login_impl(username, password)) + + self.logged_in = False 
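Each of these login() rewrites leans on the contract cookies_check() provides in the common.py hunk further above: it returns True only when every named cookie is present for the extractor's domain and none of them has already expired. A hedged, self-contained sketch of that contract, simplified rather than copied from the implementation:

    import time

    def cookies_check(cookiejar, names, domain):
        missing = set(names)
        now = time.time()
        for cookie in cookiejar:
            if cookie.name in missing and (
                    not domain or cookie.domain == domain):
                if cookie.expires and cookie.expires <= now:
                    return False      # an expired cookie forces a new login
                missing.discard(cookie.name)
        return not missing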
@cache(maxage=90*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -76,7 +76,7 @@ def _login_impl(self, username, password): if not response.history or response.url != self.root + "/user/home": raise exception.AuthenticationError() cookies = response.history[0].cookies - return {c: cookies[c] for c in self.cookienames} + return {c: cookies[c] for c in self.cookies_names} def _parse_post(self, post_id): """Extract metadata of a single post""" diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index f993db8388..67d0b11052 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -21,7 +21,7 @@ class ImagebamExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.path = match.group(1) - self.session.cookies.set("nsfw_inter", "1", domain="www.imagebam.com") + self.cookies.set("nsfw_inter", "1", domain="www.imagebam.com") def _parse_image_page(self, path): page = self.request(self.root + path).text diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index a221075240..ee979a6574 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -62,7 +62,7 @@ def items(self): def login(self): username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) + self.cookies_update(self._login_impl(username, password)) @cache(maxage=360*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -82,7 +82,7 @@ def _login_impl(self, username, password): if not response.history: raise exception.AuthenticationError() - return self.session.cookies + return self.cookies def _pagination(self, page, endpoint, params): data = None diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index faeffa6a93..29208aef07 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -27,8 +27,8 @@ class InstagramExtractor(Extractor): filename_fmt = "{sidecar_media_id:?/_/}{media_id}.{extension}" archive_fmt = "{media_id}" root = "https://www.instagram.com" - cookiedomain = ".instagram.com" - cookienames = ("sessionid",) + cookies_domain = ".instagram.com" + cookies_names = ("sessionid",) request_interval = (6.0, 12.0) def __init__(self, match): @@ -44,6 +44,8 @@ def __init__(self, match): def items(self): self.login() + self.cookies.set( + "csrftoken", self.csrf_token, domain=self.cookies_domain) if self.config("api") == "graphql": self.api = InstagramGraphqlAPI(self) @@ -131,14 +133,14 @@ def request(self, url, **kwargs): return response def login(self): - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - if username: - self._update_cookies(_login_impl(self, username, password)) - else: - self._logged_in = False - self.session.cookies.set( - "csrftoken", self.csrf_token, domain=self.cookiedomain) + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + if username: + return self.cookies_update(_login_impl(self, username, password)) + + self._logged_in = False def _parse_post_rest(self, post): if "items" in post: # story or highlight diff --git a/gallery_dl/extractor/itchio.py b/gallery_dl/extractor/itchio.py index 6034d12251..96ebbdc4c2 100644 --- a/gallery_dl/extractor/itchio.py +++ b/gallery_dl/extractor/itchio.py @@ -63,7 +63,7 @@ def items(self): "Origin": "https://{}.itch.io".format(self.user), } data = { - "csrf_token": text.unquote(self.session.cookies["itchio_token"]), 
+ "csrf_token": text.unquote(self.cookies["itchio_token"]), } for upload_id in text.extract_iter(page, 'data-upload_id="', '"'): diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 5aeefeba98..d5d02c2934 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -26,14 +26,14 @@ class KemonopartyExtractor(Extractor): directory_fmt = ("{category}", "{service}", "{user}") filename_fmt = "{id}_{title}_{num:>02}_{filename[:180]}.{extension}" archive_fmt = "{service}_{user}_{id}_{num}" - cookiedomain = ".kemono.party" + cookies_domain = ".kemono.party" def __init__(self, match): domain = match.group(1) tld = match.group(2) self.category = domain + "party" self.root = text.root_from_url(match.group(0)) - self.cookiedomain = ".{}.{}".format(domain, tld) + self.cookies_domain = ".{}.{}".format(domain, tld) Extractor.__init__(self, match) self.session.headers["Referer"] = self.root + "/" @@ -126,8 +126,8 @@ def items(self): def login(self): username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl( - (username, self.cookiedomain), password)) + self.cookies_update(self._login_impl( + (username, self.cookies_domain), password)) @cache(maxage=28*24*3600, keyarg=1) def _login_impl(self, username, password): diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 57db0c9de0..80f8758c17 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -15,7 +15,7 @@ class LusciousExtractor(Extractor): """Base class for luscious extractors""" category = "luscious" - cookiedomain = ".luscious.net" + cookies_domain = ".luscious.net" root = "https://members.luscious.net" def _graphql(self, op, variables, query): diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py index 531aef489e..ccce09b4e7 100644 --- a/gallery_dl/extractor/mangahere.py +++ b/gallery_dl/extractor/mangahere.py @@ -114,7 +114,7 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor): def __init__(self, match): MangaExtractor.__init__(self, match) - self.session.cookies.set("isAdult", "1", domain="www.mangahere.cc") + self.cookies.set("isAdult", "1", domain="www.mangahere.cc") def chapters(self, page): results = [] diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py index b7070f285e..dfa9bdf010 100644 --- a/gallery_dl/extractor/mangasee.py +++ b/gallery_dl/extractor/mangasee.py @@ -93,7 +93,7 @@ def __init__(self, match): self.session.headers["Referer"] = self.gallery_url domain = self.root.rpartition("/")[2] - cookies = self.session.cookies + cookies = self.cookies if not cookies.get("PHPSESSID", domain=domain): cookies.set("PHPSESSID", util.generate_token(13), domain=domain) diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index ac4c7978a2..cca18b135f 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -19,14 +19,14 @@ class MangoxoExtractor(Extractor): """Base class for mangoxo extractors""" category = "mangoxo" root = "https://www.mangoxo.com" - cookiedomain = "www.mangoxo.com" - cookienames = ("SESSION",) + cookies_domain = "www.mangoxo.com" + cookies_names = ("SESSION",) _warning = True def login(self): username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) + self.cookies_update(self._login_impl(username, password)) elif MangoxoExtractor._warning: MangoxoExtractor._warning = False 
self.log.warning("Unauthenticated users cannot see " @@ -51,7 +51,7 @@ def _login_impl(self, username, password): data = response.json() if str(data.get("result")) != "1": raise exception.AuthenticationError(data.get("msg")) - return {"SESSION": self.session.cookies.get("SESSION")} + return {"SESSION": self.cookies.get("SESSION")} @staticmethod def _sign_by_md5(username, password, token): diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index e190c7eb0d..ddd34f0d06 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -19,7 +19,7 @@ class MastodonExtractor(BaseExtractor): directory_fmt = ("mastodon", "{instance}", "{account[username]}") filename_fmt = "{category}_{id}_{media[id]}.{extension}" archive_fmt = "{media[id]}" - cookiedomain = None + cookies_domain = None def __init__(self, match): BaseExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index e047f3df2a..e3ea3fc9fe 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -21,8 +21,8 @@ class NewgroundsExtractor(Extractor): filename_fmt = "{category}_{_index}_{title}.{extension}" archive_fmt = "{_type}{_index}" root = "https://www.newgrounds.com" - cookiedomain = ".newgrounds.com" - cookienames = ("NG_GG_username", "vmk1du5I8m") + cookies_domain = ".newgrounds.com" + cookies_names = ("NG_GG_username", "vmk1du5I8m") request_interval = 1.0 def __init__(self, match): @@ -72,11 +72,12 @@ def metadata(self): """Return general metadata""" def login(self): - if self._check_cookies(self.cookienames): + if self.cookies_check(self.cookies_names): return + username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) + self.cookies_update(self._login_impl(username, password)) @cache(maxage=360*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -85,7 +86,7 @@ def _login_impl(self, username, password): url = self.root + "/passport/" response = self.request(url) if response.history and response.url.endswith("/social"): - return self.session.cookies + return self.cookies page = response.text headers = {"Origin": self.root, "Referer": url} @@ -105,7 +106,7 @@ def _login_impl(self, username, password): return { cookie.name: cookie.value for cookie in response.history[0].cookies - if cookie.expires and cookie.domain == self.cookiedomain + if cookie.expires and cookie.domain == self.cookies_domain } def extract_post(self, post_url): diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 079bae7677..e822895b10 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -22,8 +22,8 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): def __init__(self, match): self._init_category(match) - self.cookiedomain = "." + self.root.rpartition("/")[2] - self.cookienames = (self.category + "_tok",) + self.cookies_domain = "." 
+ self.root.rpartition("/")[2] + self.cookies_names = (self.category + "_tok",) if self.category == "horne": self._extract_data = self._extract_data_horne @@ -121,10 +121,11 @@ def _extract_user_name(page): return text.unescape(text.extr(page, "<br />", "<")) def login(self): - """Login and obtain session cookies""" - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - self._update_cookies(self._login_impl(username, password)) + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + self.cookies_update(self._login_impl(username, password)) @cache(maxage=90*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -139,7 +140,7 @@ def _login_impl(self, username, password): response = self.request(url, method="POST", data=data) if "/login.php" in response.text: raise exception.AuthenticationError() - return self.session.cookies + return self.cookies def _pagination(self, path): url = "{}/{}.php".format(self.root, path) @@ -172,7 +173,7 @@ def _pagination(self, path): class NijieUserExtractor(NijieExtractor): """Extractor for nijie user profiles""" subcategory = "user" - cookiedomain = None + cookies_domain = None pattern = BASE_PATTERN + r"/members\.php\?id=(\d+)" test = ( ("https://nijie.info/members.php?id=44"), diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index beb3da2557..fda169d832 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -21,7 +21,7 @@ class NitterExtractor(BaseExtractor): archive_fmt = "{tweet_id}_{num}" def __init__(self, match): - self.cookiedomain = self.root.partition("://")[2] + self.cookies_domain = self.root.partition("://")[2] BaseExtractor.__init__(self, match) lastindex = match.lastindex @@ -35,7 +35,7 @@ def items(self): if videos: ytdl = (videos == "ytdl") videos = True - self._cookiejar.set("hlsPlayback", "on", domain=self.cookiedomain) + self.cookies.set("hlsPlayback", "on", domain=self.cookies_domain) for tweet in self.tweets(): diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 1fa571c43b..7bccf8386c 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -21,7 +21,7 @@ class PahealExtractor(Extractor): root = "https://rule34.paheal.net" def items(self): - self.session.cookies.set( + self.cookies.set( "ui-tnc-agreed", "true", domain="rule34.paheal.net") data = self.get_metadata() diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index e4bfa2a836..99d9457ae7 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -19,7 +19,7 @@ class PatreonExtractor(Extractor): """Base class for patreon extractors""" category = "patreon" root = "https://www.patreon.com" - cookiedomain = ".patreon.com" + cookies_domain = ".patreon.com" directory_fmt = ("{category}", "{creator[full_name]}") filename_fmt = "{id}_{title}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" @@ -28,11 +28,11 @@ class PatreonExtractor(Extractor): _warning = True def items(self): - if self._warning: - if not self._check_cookies(("session_id",)): + if not self.cookies_check(("session_id",)): self.log.warning("no 'session_id' cookie set") PatreonExtractor._warning = False + generators = self._build_file_generators(self.config("files")) for post in self.posts(): diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py index 841a99bd71..af7d57f134 100644 --- a/gallery_dl/extractor/pillowfort.py +++ 
b/gallery_dl/extractor/pillowfort.py @@ -24,7 +24,7 @@ class PillowfortExtractor(Extractor): filename_fmt = ("{post_id} {title|original_post[title]:?/ /}" "{num:>02}.{extension}") archive_fmt = "{id}" - cookiedomain = "www.pillowfort.social" + cookies_domain = "www.pillowfort.social" def __init__(self, match): Extractor.__init__(self, match) @@ -82,15 +82,14 @@ def items(self): yield msgtype, url, post def login(self): - cget = self.session.cookies.get - if cget("_Pf_new_session", domain=self.cookiedomain) \ - or cget("remember_user_token", domain=self.cookiedomain): + if self.cookies.get("_Pf_new_session", domain=self.cookies_domain): + return + if self.cookies.get("remember_user_token", domain=self.cookies_domain): return username, password = self._get_auth_info() if username: - cookies = self._login_impl(username, password) - self._update_cookies(cookies) + self.cookies_update(self._login_impl(username, password)) @cache(maxage=14*24*3600, keyarg=1) def _login_impl(self, username, password): diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 861959e456..8b77de4779 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -26,7 +26,7 @@ class PixivExtractor(Extractor): directory_fmt = ("{category}", "{user[id]} {user[account]}") filename_fmt = "{id}_p{num}.{extension}" archive_fmt = "{id}{suffix}.{extension}" - cookiedomain = None + cookies_domain = None def __init__(self, match): Extractor.__init__(self, match) @@ -971,7 +971,7 @@ class PixivSketchExtractor(Extractor): filename_fmt = "{post_id} {id}.{extension}" archive_fmt = "S{user[id]}_{id}" root = "https://sketch.pixiv.net" - cookiedomain = ".pixiv.net" + cookies_domain = ".pixiv.net" pattern = r"(?:https?://)?sketch\.pixiv\.net/@([^/?#]+)" test = ("https://sketch.pixiv.net/@nicoby", { "pattern": r"https://img\-sketch\.pixiv\.net/uploads/medium" diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index fa4efa027c..0b734a77b7 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -58,7 +58,7 @@ def __init__(self, match): self._first = None def items(self): - self.session.cookies.set( + self.cookies.set( "accessAgeDisclaimerPH", "1", domain=".pornhub.com") data = self.metadata() diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 54b162b37b..05da7f4a44 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -19,7 +19,7 @@ class RedditExtractor(Extractor): directory_fmt = ("{category}", "{subreddit}") filename_fmt = "{id}{num:? 
//>02} {title[:220]}.{extension}" archive_fmt = "{filename}" - cookiedomain = ".reddit.com" + cookies_domain = ".reddit.com" request_interval = 0.6 def items(self): @@ -399,9 +399,9 @@ def __init__(self, extractor): if not self.refresh_token: # allow downloading from quarantined subreddits (#2180) - extractor._cookiejar.set( + extractor.cookies.set( "_options", '%7B%22pref_quarantine_optin%22%3A%20true%7D', - domain=extractor.cookiedomain) + domain=extractor.cookies_domain) def submission(self, submission_id): """Fetch the (submission, comments)=-tuple for a submission id""" diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 09e5421d88..ae25718cbf 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -25,7 +25,7 @@ class SankakuExtractor(BooruExtractor): basecategory = "booru" category = "sankaku" filename_fmt = "{category}_{id}_{md5}.{extension}" - cookiedomain = None + cookies_domain = None _warning = True TAG_TYPES = { diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index 711435ef91..ab4661496e 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -16,7 +16,7 @@ class SeigaExtractor(Extractor): """Base class for seiga extractors""" category = "seiga" archive_fmt = "{image_id}" - cookiedomain = ".nicovideo.jp" + cookies_domain = ".nicovideo.jp" root = "https://seiga.nicovideo.jp" def __init__(self, match): @@ -24,7 +24,7 @@ def __init__(self, match): self.start_image = 0 def items(self): - if not self._check_cookies(("user_session",)): + if not self.cookies_check(("user_session",)): raise exception.StopExtraction("'user_session' cookie required") images = iter(self.get_images()) @@ -186,7 +186,7 @@ def skip(self, num): return num def get_images(self): - self.session.cookies.set( + self.cookies.set( "skip_fetish_warning", "1", domain="seiga.nicovideo.jp") url = "{}/seiga/im{}".format(self.root, self.image_id) diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py index 92c9d2cbb5..b3b27462b4 100644 --- a/gallery_dl/extractor/senmanga.py +++ b/gallery_dl/extractor/senmanga.py @@ -71,7 +71,7 @@ def __init__(self, match): self.session.headers["Referer"] = self.gallery_url # select "All pages" viewer - self.session.cookies.set( + self.cookies.set( "viewer", "1", domain="raw.senmanga.com") def metadata(self, page): diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 285cd8fed6..b0dd9bbdea 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -29,7 +29,7 @@ def __init__(self, match): cookies = instance.get("cookies") if cookies: domain = self.root.rpartition("/")[2] - self._update_cookies_dict(cookies, domain=domain) + self.cookies_update_dict(cookies, domain=domain) file_url = instance.get("file_url") if file_url: self.file_url_fmt = file_url diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 713d4c41c1..e30c491156 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -21,7 +21,7 @@ class SmugmugExtractor(Extractor): category = "smugmug" filename_fmt = ("{category}_{User[NickName]:?/_/}" "{Image[UploadKey]}_{Image[ImageKey]}.{extension}") - cookiedomain = None + cookies_domain = None empty_user = { "Uri": "", "ResponseLevel": "Public", diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 4de7e9b59c..a2e1388c14 100644 --- a/gallery_dl/extractor/subscribestar.py +++ 
b/gallery_dl/extractor/subscribestar.py @@ -22,14 +22,14 @@ class SubscribestarExtractor(Extractor): directory_fmt = ("{category}", "{author_name}") filename_fmt = "{post_id}_{id}.{extension}" archive_fmt = "{id}" - cookiedomain = "www.subscribestar.com" - cookienames = ("auth_token",) + cookies_domain = "www.subscribestar.com" + cookies_names = ("auth_token",) def __init__(self, match): tld, self.item = match.groups() if tld == "adult": self.root = "https://subscribestar.adult" - self.cookiedomain = "subscribestar.adult" + self.cookies_domain = "subscribestar.adult" self.subcategory += "-adult" Extractor.__init__(self, match) @@ -49,12 +49,12 @@ def posts(self): """Yield HTML content of all relevant posts""" def login(self): - if self._check_cookies(self.cookienames): + if self.cookies_check(self.cookies_names): return + username, password = self._get_auth_info() if username: - cookies = self._login_impl(username, password) - self._update_cookies(cookies) + self.cookies_update(self._login_impl(username, password)) @cache(maxage=28*24*3600, keyarg=1) def _login_impl(self, username, password): diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index 545a95bb69..ec4a249c33 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -22,8 +22,8 @@ class TapasExtractor(Extractor): directory_fmt = ("{category}", "{series[title]}", "{id} {title}") filename_fmt = "{num:>02}.{extension}" archive_fmt = "{id}_{num}" - cookiedomain = ".tapas.io" - cookienames = ("_cpc_",) + cookies_domain = ".tapas.io" + cookies_names = ("_cpc_",) _cache = None def __init__(self, match): @@ -70,14 +70,17 @@ def items(self): yield Message.Url, url, text.nameext_from_url(url, episode) def login(self): - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - if username: - self._update_cookies(self._login_impl(username, password)) - else: - sc = self.session.cookies.set - sc("birthDate" , "1981-02-03", domain=self.cookiedomain) - sc("adjustedBirthDate", "1981-02-03", domain=self.cookiedomain) + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + if username: + return self.cookies_update(self._login_impl(username, password)) + + self.cookies.set( + "birthDate" , "1981-02-03", domain=self.cookies_domain) + self.cookies.set( + "adjustedBirthDate", "1981-02-03", domain=self.cookies_domain) @cache(maxage=14*24*3600, keyarg=1) def _login_impl(self, username, password): diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index 92bd634796..e7d5226a9f 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -16,15 +16,15 @@ class TsuminoBase(): """Base class for tsumino extractors""" category = "tsumino" - cookiedomain = "www.tsumino.com" + cookies_domain = "www.tsumino.com" root = "https://www.tsumino.com" def login(self): username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) + self.cookies_update(self._login_impl(username, password)) else: - self.session.cookies.setdefault( + self.cookies.setdefault( "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5") @cache(maxage=14*24*3600, keyarg=1) @@ -37,7 +37,7 @@ def _login_impl(self, username, password): response = self.request(url, method="POST", headers=headers, data=data) if not response.history: raise exception.AuthenticationError() - return self.session.cookies + return self.cookies class TsuminoGalleryExtractor(TsuminoBase, 
GalleryExtractor): diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index b45609d729..f42da48865 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -31,7 +31,7 @@ class TumblrExtractor(Extractor): directory_fmt = ("{category}", "{blog_name}") filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" - cookiedomain = None + cookies_domain = None def __init__(self, match): Extractor.__init__(self, match) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 092ddb4914..7e42079974 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -24,8 +24,8 @@ class TwitterExtractor(Extractor): directory_fmt = ("{category}", "{user[name]}") filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{retweet_id}_{num}" - cookiedomain = ".twitter.com" - cookienames = ("auth_token",) + cookies_domain = ".twitter.com" + cookies_names = ("auth_token",) root = "https://twitter.com" browser = "firefox" @@ -455,10 +455,12 @@ def tweets(self): """Yield all relevant tweet objects""" def login(self): - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - if username: - self._update_cookies(_login_impl(self, username, password)) + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + if username: + self.cookies_update(_login_impl(self, username, password)) class TwitterUserExtractor(TwitterExtractor): @@ -1121,19 +1123,19 @@ def __init__(self, extractor): self._syndication = self.extractor.syndication self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode - cookies = extractor.session.cookies - cookiedomain = extractor.cookiedomain + cookies = extractor.cookies + cookies_domain = extractor.cookies_domain csrf = extractor.config("csrf") if csrf is None or csrf == "cookies": - csrf_token = cookies.get("ct0", domain=cookiedomain) + csrf_token = cookies.get("ct0", domain=cookies_domain) else: csrf_token = None if not csrf_token: csrf_token = util.generate_token() - cookies.set("ct0", csrf_token, domain=cookiedomain) + cookies.set("ct0", csrf_token, domain=cookies_domain) - auth_token = cookies.get("auth_token", domain=cookiedomain) + auth_token = cookies.get("auth_token", domain=cookies_domain) self.headers = { "Accept": "*/*", @@ -1489,8 +1491,8 @@ def _authenticate_guest(self): guest_token = self._guest_token() if guest_token != self.headers["x-guest-token"]: self.headers["x-guest-token"] = guest_token - self.extractor.session.cookies.set( - "gt", guest_token, domain=self.extractor.cookiedomain) + self.extractor.cookies.set( + "gt", guest_token, domain=self.extractor.cookies_domain) def _call(self, endpoint, params, method="GET", auth=True, root=None): url = (root or self.root) + endpoint @@ -1683,8 +1685,8 @@ def _pagination_tweets(self, endpoint, variables, if user.get("blocked_by"): if self.headers["x-twitter-auth-type"] and \ extr.config("logout"): - extr._cookiefile = None - del extr.session.cookies["auth_token"] + extr.cookies_file = None + del extr.cookies["auth_token"] self.headers["x-twitter-auth-type"] = None extr.log.info("Retrying API request as guest") continue @@ -1938,7 +1940,7 @@ def process(response): extr.log.debug(response.text) raise exception.AuthenticationError(", ".join(errors)) - extr.session.cookies.clear() + extr.cookies.clear() api = TwitterAPI(extr) api._authenticate_guest() headers = api.headers @@ -2078,5 +2080,5 @@ def 
process(response): return { cookie.name: cookie.value - for cookie in extr.session.cookies + for cookie in extr.cookies } diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 6dff01c824..d8aa6cdb6d 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -23,8 +23,8 @@ class VipergirlsExtractor(Extractor): root = "https://vipergirls.to" request_interval = 0.5 request_interval_min = 0.2 - cookiedomain = ".vipergirls.to" - cookienames = ("vg_userid", "vg_password") + cookies_domain = ".vipergirls.to" + cookies_names = ("vg_userid", "vg_password") def __init__(self, match): Extractor.__init__(self, match) @@ -42,10 +42,12 @@ def items(self): yield Message.Queue, image.attrib["main_url"], data def login(self): - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - if username: - self._update_cookies(self._login_impl(username, password)) + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + if username: + self.cookies_update(self._login_impl(username, password)) @cache(maxage=90*24*3600, keyarg=1) def _login_impl(self, username, password): diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 21f7c21ebf..7b3e8033f2 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -18,10 +18,10 @@ class WebtoonsBase(): category = "webtoons" root = "https://www.webtoons.com" - cookiedomain = ".webtoons.com" + cookies_domain = ".webtoons.com" def setup_agegate_cookies(self): - self._update_cookies({ + self.cookies_update({ "atGDPR" : "AD_CONSENT", "needCCPA" : "false", "needCOPPA" : "false", diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 5a3adc80b4..2de7a2fc2a 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -34,7 +34,7 @@ def __init__(self, match): cookies = _cookie_cache() if cookies is not None: - self.session.cookies.update(cookies) + self.cookies.update(cookies) self.session.headers["Referer"] = self.root + "/" def request(self, url, **kwargs): diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py index b3a16521c1..7f3c8de78c 100644 --- a/gallery_dl/extractor/ytdl.py +++ b/gallery_dl/extractor/ytdl.py @@ -76,7 +76,7 @@ def items(self): ytdl_module, self, user_opts, extr_opts) # transfer cookies to ytdl - cookies = self.session.cookies + cookies = self.cookies if cookies: set_cookie = ytdl_instance.cookiejar.set_cookie for cookie in cookies: diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 148b92afc9..8187db886e 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -21,17 +21,19 @@ class ZerochanExtractor(BooruExtractor): root = "https://www.zerochan.net" filename_fmt = "{id}.{extension}" archive_fmt = "{id}" - cookiedomain = ".zerochan.net" - cookienames = ("z_id", "z_hash") + cookies_domain = ".zerochan.net" + cookies_names = ("z_id", "z_hash") def login(self): self._logged_in = True - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - if username: - self._update_cookies(self._login_impl(username, password)) - else: - self._logged_in = False + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + if username: + return self.cookies_update(self._login_impl(username, password)) + + self._logged_in = False @cache(maxage=90*86400, keyarg=1) 
def _login_impl(self, username, password): diff --git a/gallery_dl/job.py b/gallery_dl/job.py index ca5785d978..7ecdc39134 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -378,7 +378,7 @@ def handle_finalize(self): for callback in hooks["post-after"]: callback(pathfmt) - self.extractor._store_cookies() + self.extractor.cookies_store() if "finalize" in hooks: status = self.status for callback in hooks["finalize"]: diff --git a/test/test_cookies.py b/test/test_cookies.py index 335fa3dd79..5a4fbe65c0 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2017-2022 Mike Fährmann +# Copyright 2017-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -47,7 +47,7 @@ def tearDownClass(cls): def test_cookiefile(self): config.set((), "cookies", self.cookiefile) - cookies = extractor.find("test:").session.cookies + cookies = extractor.find("test:").cookies self.assertEqual(len(cookies), 1) cookie = next(iter(cookies)) @@ -66,7 +66,7 @@ def _test_warning(self, filename, exc): config.set((), "cookies", filename) log = logging.getLogger("test") with mock.patch.object(log, "warning") as mock_warning: - cookies = extractor.find("test:").session.cookies + cookies = extractor.find("test:").cookies self.assertEqual(len(cookies), 0) self.assertEqual(mock_warning.call_count, 1) self.assertEqual(mock_warning.call_args[0][0], "cookies: %s") @@ -83,7 +83,7 @@ def tearDown(self): config.clear() def test_dict(self): - cookies = extractor.find("test:").session.cookies + cookies = extractor.find("test:").cookies self.assertEqual(len(cookies), len(self.cdict)) self.assertEqual(sorted(cookies.keys()), sorted(self.cdict.keys())) self.assertEqual(sorted(cookies.values()), sorted(self.cdict.values())) @@ -91,11 +91,11 @@ def test_dict(self): def test_domain(self): for category in ["exhentai", "idolcomplex", "nijie", "horne"]: extr = _get_extractor(category) - cookies = extr.session.cookies + cookies = extr.cookies for key in self.cdict: self.assertTrue(key in cookies) for c in cookies: - self.assertEqual(c.domain, extr.cookiedomain) + self.assertEqual(c.domain, extr.cookies_domain) class TestCookieLogin(unittest.TestCase): @@ -123,79 +123,79 @@ class TestCookieUtils(unittest.TestCase): def test_check_cookies(self): extr = extractor.find("test:") - self.assertFalse(extr._cookiejar, "empty") - self.assertFalse(extr.cookiedomain, "empty") + self.assertFalse(extr.cookies, "empty") + self.assertFalse(extr.cookies_domain, "empty") # always returns False when checking for empty cookie list - self.assertFalse(extr._check_cookies(())) + self.assertFalse(extr.cookies_check(())) - self.assertFalse(extr._check_cookies(("a",))) - self.assertFalse(extr._check_cookies(("a", "b"))) - self.assertFalse(extr._check_cookies(("a", "b", "c"))) + self.assertFalse(extr.cookies_check(("a",))) + self.assertFalse(extr.cookies_check(("a", "b"))) + self.assertFalse(extr.cookies_check(("a", "b", "c"))) - extr._cookiejar.set("a", "1") - self.assertTrue(extr._check_cookies(("a",))) - self.assertFalse(extr._check_cookies(("a", "b"))) - self.assertFalse(extr._check_cookies(("a", "b", "c"))) + extr.cookies.set("a", "1") + self.assertTrue(extr.cookies_check(("a",))) + self.assertFalse(extr.cookies_check(("a", "b"))) + self.assertFalse(extr.cookies_check(("a", "b", "c"))) - extr._cookiejar.set("b", "2") - self.assertTrue(extr._check_cookies(("a",))) - 
self.assertTrue(extr._check_cookies(("a", "b"))) - self.assertFalse(extr._check_cookies(("a", "b", "c"))) + extr.cookies.set("b", "2") + self.assertTrue(extr.cookies_check(("a",))) + self.assertTrue(extr.cookies_check(("a", "b"))) + self.assertFalse(extr.cookies_check(("a", "b", "c"))) def test_check_cookies_domain(self): extr = extractor.find("test:") - self.assertFalse(extr._cookiejar, "empty") - extr.cookiedomain = ".example.org" + self.assertFalse(extr.cookies, "empty") + extr.cookies_domain = ".example.org" - self.assertFalse(extr._check_cookies(("a",))) - self.assertFalse(extr._check_cookies(("a", "b"))) + self.assertFalse(extr.cookies_check(("a",))) + self.assertFalse(extr.cookies_check(("a", "b"))) - extr._cookiejar.set("a", "1") - self.assertFalse(extr._check_cookies(("a",))) + extr.cookies.set("a", "1") + self.assertFalse(extr.cookies_check(("a",))) - extr._cookiejar.set("a", "1", domain=extr.cookiedomain) - self.assertTrue(extr._check_cookies(("a",))) + extr.cookies.set("a", "1", domain=extr.cookies_domain) + self.assertTrue(extr.cookies_check(("a",))) - extr._cookiejar.set("a", "1", domain="www" + extr.cookiedomain) - self.assertEqual(len(extr._cookiejar), 3) - self.assertTrue(extr._check_cookies(("a",))) + extr.cookies.set("a", "1", domain="www" + extr.cookies_domain) + self.assertEqual(len(extr.cookies), 3) + self.assertTrue(extr.cookies_check(("a",))) - extr._cookiejar.set("b", "2", domain=extr.cookiedomain) - extr._cookiejar.set("c", "3", domain=extr.cookiedomain) - self.assertTrue(extr._check_cookies(("a", "b", "c"))) + extr.cookies.set("b", "2", domain=extr.cookies_domain) + extr.cookies.set("c", "3", domain=extr.cookies_domain) + self.assertTrue(extr.cookies_check(("a", "b", "c"))) def test_check_cookies_expires(self): extr = extractor.find("test:") - self.assertFalse(extr._cookiejar, "empty") - self.assertFalse(extr.cookiedomain, "empty") + self.assertFalse(extr.cookies, "empty") + self.assertFalse(extr.cookies_domain, "empty") now = int(time.time()) log = logging.getLogger("test") - extr._cookiejar.set("a", "1", expires=now-100) + extr.cookies.set("a", "1", expires=now-100) with mock.patch.object(log, "warning") as mw: - self.assertFalse(extr._check_cookies(("a",))) + self.assertFalse(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 1) self.assertEqual(mw.call_args[0], ("Cookie '%s' has expired", "a")) - extr._cookiejar.set("a", "1", expires=now+100) + extr.cookies.set("a", "1", expires=now+100) with mock.patch.object(log, "warning") as mw: - self.assertTrue(extr._check_cookies(("a",))) + self.assertTrue(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 1) self.assertEqual(mw.call_args[0], ( "Cookie '%s' will expire in less than %s hour%s", "a", 1, "")) - extr._cookiejar.set("a", "1", expires=now+100+7200) + extr.cookies.set("a", "1", expires=now+100+7200) with mock.patch.object(log, "warning") as mw: - self.assertTrue(extr._check_cookies(("a",))) + self.assertTrue(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 1) self.assertEqual(mw.call_args[0], ( "Cookie '%s' will expire in less than %s hour%s", "a", 3, "s")) - extr._cookiejar.set("a", "1", expires=now+100+24*3600) + extr.cookies.set("a", "1", expires=now+100+24*3600) with mock.patch.object(log, "warning") as mw: - self.assertTrue(extr._check_cookies(("a",))) + self.assertTrue(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 0) From 088e8d5fcf3751a9f71232d1d85776c584ad528f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 
22 Jul 2023 14:05:40 +0200 Subject: [PATCH 231/252] [pornhub] fix extraction (#4301) --- gallery_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index 0b734a77b7..d3619da695 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -111,7 +111,7 @@ def images(self): "views" : text.parse_int(img["times_viewed"]), "score" : text.parse_int(img["vote_percent"]), } - key = img["next"] + key = str(img["next"]) if key == end: return From a45a17ddb7504541907772ac330ff278a0f20878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 22 Jul 2023 14:38:26 +0200 Subject: [PATCH 232/252] [pixiv] ignore 'limit_sanity_level' images (#4328) --- gallery_dl/extractor/pixiv.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 8b77de4779..3cc59acfff 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -47,6 +47,8 @@ def transform_tags(work): def transform_tags(work): work["tags"] = [tag["name"] for tag in work["tags"]] + url_sanity = ("https://s.pximg.net/common/images" + "/limit_sanity_level_360.png") ratings = {0: "General", 1: "R-18", 2: "R-18G"} meta_user = self.config("metadata") meta_bookmark = self.config("metadata-bookmark") @@ -102,6 +104,10 @@ def transform_tags(work): elif work["page_count"] == 1: url = meta_single_page["original_image_url"] + if url == url_sanity: + self.log.debug("Skipping 'sanity_level' warning (%s)", + work["id"]) + continue work["date_url"] = self._date_from_url(url) yield Message.Url, url, text.nameext_from_url(url, work) From 7da954f810799f41f4d4dc8efd32b122d47ed903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 22 Jul 2023 15:38:33 +0200 Subject: [PATCH 233/252] [flickr] update default API credentials (#4332) and add a delay between API requests --- gallery_dl/extractor/flickr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 9f97a3316b..cb7d1e8123 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -21,6 +21,8 @@ class FlickrExtractor(Extractor): directory_fmt = ("{category}", "{user[username]}") archive_fmt = "{id}" cookies_domain = None + request_interval = (1.0, 2.0) + request_interval_min = 0.2 def __init__(self, match): Extractor.__init__(self, match) @@ -289,8 +291,8 @@ class FlickrAPI(oauth.OAuth1API): """ API_URL = "https://api.flickr.com/services/rest/" - API_KEY = "ac4fd7aa98585b9eee1ba761c209de68" - API_SECRET = "3adb0f568dc68393" + API_KEY = "f8f78d1a40debf471f0b22fa2d00525f" + API_SECRET = "4f9dae1113e45556" FORMATS = [ ("o" , "Original" , None), ("6k", "X-Large 6K" , 6144), From 1baf83a9e52c2b1d09b32d5e115aebf8e5f2d279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 22 Jul 2023 16:15:55 +0200 Subject: [PATCH 234/252] [hiperdex] fix for unicode titles (#4325) --- gallery_dl/extractor/hiperdex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 3aad88cb42..22fe7f6c01 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -153,7 +153,7 @@ def chapters(self, page): "Accept": "*/*", "X-Requested-With": "XMLHttpRequest", "Origin": 
self.root,
-            "Referer": self.manga_url,
+            "Referer": "https://" + text.quote(self.manga_url[8:]),
         }
         html = self.request(url, method="POST", headers=headers).text

From 54d974deb05f941ee7adc24d550bc7e98eb0bb0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Mon, 24 Jul 2023 12:26:40 +0200
Subject: [PATCH 235/252] add 'python' post processor

similar to 'exec' but calls a Python function
---
 docs/configuration.rst               | 46 +++++++++++++++++++++++++++
 gallery_dl/postprocessor/__init__.py |  3 +-
 gallery_dl/postprocessor/python.py   | 47 ++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 gallery_dl/postprocessor/python.py

diff --git a/docs/configuration.rst b/docs/configuration.rst
index fc531632b2..7880bf5e13 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -4748,6 +4748,50 @@ Description
     |datetime|_ object.
 
 
+python.archive
+--------------
+Type
+    |Path|_
+Description
+    File to store IDs of called Python functions in,
+    similar to `extractor.*.archive`_.
+
+    ``archive-format``, ``archive-prefix``, and ``archive-pragma`` options,
+    akin to
+    `extractor.*.archive-format`_,
+    `extractor.*.archive-prefix`_, and
+    `extractor.*.archive-pragma`_,
+    are supported as well.
+
+
+python.event
+------------
+Type
+    ``string``
+Default
+    ``"file"``
+Description
+    The event for which `python.function`_ gets called.
+
+    See `metadata.event`_ for a list of available events.
+
+
+python.function
+---------------
+Type
+    ``string``
+Example
+    * ``"my_module:generate_text"``
+    * ``"~/.local/share/gdl-utils.py:resize"``
+Description
+    The Python function to call.
+
+    This function gets specified as ``<module>:<function name>``
+    and gets called with the current metadata dict as argument.
+
+    ``module`` is either an importable Python module name
+    or the |Path|_ to a `.py` file.
+
+
 ugoira.extension
 ----------------
 Type
@@ -5345,6 +5389,8 @@ Description
         Write metadata to separate files
     ``mtime``
         Set file modification time according to its metadata
+    ``python``
+        Call Python functions
     ``ugoira``
         Convert Pixiv Ugoira to WebM using `FFmpeg <https://www.ffmpeg.org/>`__
     ``zip``
diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py
index ee490e79a2..46905547b4 100644
--- a/gallery_dl/postprocessor/__init__.py
+++ b/gallery_dl/postprocessor/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -14,6 +14,7 @@
     "exec",
     "metadata",
     "mtime",
+    "python",
     "ugoira",
     "zip",
 ]
diff --git a/gallery_dl/postprocessor/python.py b/gallery_dl/postprocessor/python.py
new file mode 100644
index 0000000000..dd7281081c
--- /dev/null
+++ b/gallery_dl/postprocessor/python.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Run Python functions"""
+
+from .common import PostProcessor
+from .. import util
+
+
+class PythonPP(PostProcessor):
+
+    def __init__(self, job, options):
+        PostProcessor.__init__(self, job)
+
+        spec = options["function"]
+        module_name, _, function_name = spec.rpartition(":")
+        module = util.import_file(module_name)
+        self.function = getattr(module, function_name)
+
+        self._init_archive(job, options)
+        if self.archive:
+            self.run = self.run_archive
+
+        events = options.get("event")
+        if events is None:
+            events = ("file",)
+        elif isinstance(events, str):
+            events = events.split(",")
+        job.register_hooks({event: self.run for event in events}, options)
+
+    def run(self, pathfmt):
+        self.function(pathfmt.kwdict)
+
+    def run_archive(self, pathfmt):
+        kwdict = pathfmt.kwdict
+        if self.archive.check(kwdict):
+            return
+        self.function(kwdict)
+        self.archive.add(kwdict)
+
+
+__postprocessor__ = PythonPP

From 7cd5130e884287db88101025cfa551e2b689ca4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Mon, 24 Jul 2023 14:27:37 +0200
Subject: [PATCH 236/252] [docs] small fixes/updates

---
 docs/configuration.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 7880bf5e13..9bfc1e7304 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -442,7 +442,7 @@ Description
           "isAdult"   : "1"
       }
 
-  * A ``list`` with up to 4 entries specifying a browser profile.
+  * A ``list`` with up to 5 entries specifying a browser profile.
 
     * The first entry is the browser name
     * The optional second entry is a profile name or an absolute path to a profile directory
@@ -3277,8 +3277,8 @@ Type
 Default
     ``"auto"``
 Description
-    Controls the strategy / tweet source used for user URLs
-    (``https://twitter.com/USER``).
+    Controls the strategy / tweet source used for timeline URLs
+    (``https://twitter.com/USER/timeline``).
 
     * ``"tweets"``: `/tweets <https://twitter.com/USER/tweets>`__ timeline + search
     * ``"media"``: `/media <https://twitter.com/USER/media>`__ timeline + search
@@ -4726,7 +4726,7 @@ Default
 Description
     Name of the metadata field whose value should be used.
 
-    This value must either be a UNIX timestamp or a
+    This value must be either a UNIX timestamp or a
     |datetime|_ object.
 
     Note: This option gets ignored if `mtime.value`_ is set.
@@ -4744,7 +4744,7 @@ Example
 Description
     A `format string`_ whose value should be used.
 
-    The resulting value must either be a UNIX timestamp or a
+    The resulting value must be either a UNIX timestamp or a
     |datetime|_ object.
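To illustrate the 'python' post processor added in PATCH 235: a minimal sketch of a
user-side hook module together with a matching config entry. The file path, function
name, and the "title" cleanup below are made-up examples, not part of the patch itself:

    # ~/scripts/hooks.py -- hypothetical user module
    def normalize(kwdict):
        # receives the current metadata dict and may modify it in place;
        # changes are visible to later format strings and post processors
        title = kwdict.get("title") or ""
        kwdict["title"] = " ".join(title.split())

and, assuming that file path, a configuration entry along these lines:

    {
        "extractor": {
            "postprocessors": [{
                "name"    : "python",
                "event"   : "prepare",
                "function": "~/scripts/hooks.py:normalize"
            }]
        }
    }

With "event": "prepare" the function runs before each file download; without it, the
default event "file" fires after each completed download.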
From 6c9432165e2232e7a1346829da32116417758c66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Mon, 24 Jul 2023 14:32:25 +0200
Subject: [PATCH 237/252] add return value to 'PostProcessor._init_archive()'

---
 gallery_dl/postprocessor/common.py | 2 ++
 gallery_dl/postprocessor/python.py | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index c28d060db8..10d9fbab8c 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -45,5 +45,7 @@ def _init_archive(self, job, options, prefix=None):
                     self.name, archive, exc.__class__.__name__, exc)
             else:
                 self.log.debug("Using %s archive '%s'", self.name, archive)
+                return True
         else:
             self.archive = None
+            return False
diff --git a/gallery_dl/postprocessor/python.py b/gallery_dl/postprocessor/python.py
index dd7281081c..db71da2502 100644
--- a/gallery_dl/postprocessor/python.py
+++ b/gallery_dl/postprocessor/python.py
@@ -22,8 +22,7 @@ def __init__(self, job, options):
         module = util.import_file(module_name)
         self.function = getattr(module, function_name)
 
-        self._init_archive(job, options)
-        if self.archive:
+        if self._init_archive(job, options):
             self.run = self.run_archive
 
         events = options.get("event")

From f0203b755971c1c1b4e265f7bf9c31d63b1c5523 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Mon, 24 Jul 2023 15:22:57 +0200
Subject: [PATCH 238/252] [postprocessor:python] add tests

---
 test/test_postprocessor.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index 554a51e82d..bcabdc8c04 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -579,6 +579,40 @@ def test_mtime_value(self):
         self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
 
 
+class PythonTest(BasePostprocessorTest):
+
+    def test_module(self):
+        path = os.path.join(self.dir.name, "module.py")
+        self._write_module(path)
+
+        sys.path.insert(0, self.dir.name)
+        try:
+            self._create({"function": "module:calc"}, {"_value": 123})
+        finally:
+            del sys.path[0]
+
+        self.assertNotIn("_result", self.pathfmt.kwdict)
+        self._trigger()
+        self.assertEqual(self.pathfmt.kwdict["_result"], 246)
+
+    def test_path(self):
+        path = os.path.join(self.dir.name, "module.py")
+        self._write_module(path)
+
+        self._create({"function": path + ":calc"}, {"_value": 12})
+
+        self.assertNotIn("_result", self.pathfmt.kwdict)
+        self._trigger()
+        self.assertEqual(self.pathfmt.kwdict["_result"], 24)
+
+    def _write_module(self, path):
+        with open(path, "w") as fp:
+            fp.write("""
+def calc(kwdict):
+    kwdict["_result"] = kwdict["_value"] * 2
+""")
+
+
 class ZipTest(BasePostprocessorTest):
 
     def test_zip_default(self):

From a383eca7f6ec0f08dcc854d594aecbfecc4f45be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Tue, 25 Jul 2023 20:09:44 +0200
Subject: [PATCH 239/252] decouple extractor initialization

Introduce an 'initialize()' function that does the actual init
(session, cookies, config options) and can be called separately
from the constructor __init__().

This allows a Job, for example, to adjust config access after
'extractor.find()' returns but before the actual initialization
happens, instead of most of it already having happened inside
the constructor.
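As a condensed sketch of the pattern this patch applies to each extractor below
(class and option names are illustrative, not copied from any single module):

    class ExampleExtractor(Extractor):

        def __init__(self, match):
            Extractor.__init__(self, match)
            # cheap argument parsing can stay in the constructor
            self.item = match.group(1)

        def _init(self):
            # config lookups, session headers, and cookie setup move here;
            # Extractor.initialize() runs them exactly once, triggered by
            # the first call to __iter__()
            self.videos = self.config("videos", True)
            self.session.headers["Referer"] = self.root + "/"

Since __iter__() now calls initialize() first, a Job can construct an extractor
via extractor.find(), adjust configuration, and only then let the session and
cookie setup run.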
--- gallery_dl/extractor/3dbooru.py | 12 +++--- gallery_dl/extractor/500px.py | 3 +- gallery_dl/extractor/8chan.py | 4 +- gallery_dl/extractor/artstation.py | 6 +-- gallery_dl/extractor/aryion.py | 8 +++- gallery_dl/extractor/blogger.py | 5 ++- gallery_dl/extractor/common.py | 56 ++++++++++++++++--------- gallery_dl/extractor/danbooru.py | 3 +- gallery_dl/extractor/deviantart.py | 25 +++++++---- gallery_dl/extractor/exhentai.py | 19 +++++---- gallery_dl/extractor/fanbox.py | 5 +-- gallery_dl/extractor/flickr.py | 4 +- gallery_dl/extractor/foolfuuka.py | 4 +- gallery_dl/extractor/furaffinity.py | 5 +++ gallery_dl/extractor/gelbooru_v02.py | 3 +- gallery_dl/extractor/gfycat.py | 1 + gallery_dl/extractor/hentaicosplays.py | 4 +- gallery_dl/extractor/hentaifoundry.py | 3 ++ gallery_dl/extractor/hitomi.py | 9 ++-- gallery_dl/extractor/hotleak.py | 5 +-- gallery_dl/extractor/idolcomplex.py | 2 + gallery_dl/extractor/imagebam.py | 4 +- gallery_dl/extractor/imagechest.py | 9 ++-- gallery_dl/extractor/imagefap.py | 5 +-- gallery_dl/extractor/imgur.py | 4 +- gallery_dl/extractor/inkbunny.py | 3 +- gallery_dl/extractor/instagram.py | 23 ++++++---- gallery_dl/extractor/itaku.py | 4 +- gallery_dl/extractor/kemonoparty.py | 7 ++-- gallery_dl/extractor/lolisafe.py | 3 +- gallery_dl/extractor/luscious.py | 4 +- gallery_dl/extractor/mangadex.py | 6 ++- gallery_dl/extractor/mangafox.py | 4 +- gallery_dl/extractor/mangahere.py | 4 +- gallery_dl/extractor/mangakakalot.py | 6 ++- gallery_dl/extractor/manganelo.py | 4 +- gallery_dl/extractor/mangasee.py | 2 + gallery_dl/extractor/mastodon.py | 4 +- gallery_dl/extractor/misskey.py | 4 +- gallery_dl/extractor/myhentaigallery.py | 4 +- gallery_dl/extractor/newgrounds.py | 5 +++ gallery_dl/extractor/nijie.py | 18 ++++---- gallery_dl/extractor/oauth.py | 2 + gallery_dl/extractor/paheal.py | 1 + gallery_dl/extractor/philomena.py | 3 +- gallery_dl/extractor/photobucket.py | 7 +++- gallery_dl/extractor/pinterest.py | 6 +-- gallery_dl/extractor/pixiv.py | 6 ++- gallery_dl/extractor/pornpics.py | 4 +- gallery_dl/extractor/reactor.py | 11 +++-- gallery_dl/extractor/readcomiconline.py | 4 +- gallery_dl/extractor/redgifs.py | 2 + gallery_dl/extractor/rule34us.py | 5 +-- gallery_dl/extractor/senmanga.py | 6 +-- gallery_dl/extractor/shimmie2.py | 23 +++++----- gallery_dl/extractor/simplyhentai.py | 4 +- gallery_dl/extractor/skeb.py | 2 + gallery_dl/extractor/smugmug.py | 3 +- gallery_dl/extractor/szurubooru.py | 3 +- gallery_dl/extractor/tapas.py | 3 +- gallery_dl/extractor/tumblr.py | 1 + gallery_dl/extractor/twibooru.py | 3 +- gallery_dl/extractor/twitter.py | 5 +++ gallery_dl/extractor/urlshortener.py | 1 + gallery_dl/extractor/vipergirls.py | 5 +-- gallery_dl/extractor/wallhaven.py | 6 ++- gallery_dl/extractor/weasyl.py | 3 +- gallery_dl/extractor/webtoons.py | 26 +++++++----- gallery_dl/extractor/weibo.py | 5 +++ test/test_cookies.py | 43 +++++++++++-------- test/test_downloader.py | 1 + 71 files changed, 314 insertions(+), 193 deletions(-) diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py index e0066cb9d6..e83bca7575 100644 --- a/gallery_dl/extractor/3dbooru.py +++ b/gallery_dl/extractor/3dbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,12 +17,10 @@ class _3dbooruBase(): basecategory = "booru" root = "http://behoimi.org" - 
def __init__(self, match): - super().__init__(match) - self.session.headers.update({ - "Referer": "http://behoimi.org/post/show/", - "Accept-Encoding": "identity", - }) + def _init(self): + headers = self.session.headers + headers["Referer"] = "http://behoimi.org/post/show/" + headers["Accept-Encoding"] = "identity" class _3dbooruTagExtractor(_3dbooruBase, moebooru.MoebooruTagExtractor): diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index ac38b60406..4d1307e12d 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -23,8 +23,7 @@ class _500pxExtractor(Extractor): root = "https://500px.com" cookies_domain = ".500px.com" - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.session.headers["Referer"] = self.root + "/" def items(self): diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index f098008a60..2d043868f1 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -150,6 +150,8 @@ class _8chanBoardExtractor(_8chanExtractor): def __init__(self, match): _8chanExtractor.__init__(self, match) _, self.board, self.page = match.groups() + + def _init(self): self.session.headers["Referer"] = self.root + "/" def items(self): diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index a3a7c1e154..77d5fbd2c2 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -27,12 +27,12 @@ class ArtstationExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) or match.group(2) - self.external = self.config("external", False) def items(self): data = self.metadata() projects = self.projects() + external = self.config("external", False) max_posts = self.config("max-posts") if max_posts: projects = itertools.islice(projects, max_posts) @@ -45,7 +45,7 @@ def items(self): asset["num"] = num yield Message.Directory, asset - if adict["has_embedded_player"] and self.external: + if adict["has_embedded_player"] and external: player = adict["player_embedded"] url = (text.extr(player, 'src="', '"') or text.extr(player, "src='", "'")) diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index ad0f9dc672..89a8319aff 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -189,9 +189,11 @@ class AryionGalleryExtractor(AryionExtractor): def __init__(self, match): AryionExtractor.__init__(self, match) - self.recursive = self.config("recursive", True) self.offset = 0 + def _init(self): + self.recursive = self.config("recursive", True) + def skip(self, num): if self.recursive: return 0 @@ -217,9 +219,11 @@ class AryionTagExtractor(AryionExtractor): "count": ">= 5", }) - def metadata(self): + def _init(self): self.params = text.parse_query(self.user) self.user = None + + def metadata(self): return {"search_tags": self.params.get("tag")} def posts(self): diff --git a/gallery_dl/extractor/blogger.py 
b/gallery_dl/extractor/blogger.py index 3ceada8d5a..b25af8f99e 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -28,12 +28,13 @@ class BloggerExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.videos = self.config("videos", True) self.blog = match.group(1) or match.group(2) + + def _init(self): self.api = BloggerAPI(self) + self.videos = self.config("videos", True) def items(self): - blog = self.api.blog_by_url("http://" + self.blog) blog["pages"] = blog["pages"]["totalItems"] blog["posts"] = blog["posts"]["totalItems"] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 2e5ce4d4a6..fc6b197cdc 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -52,25 +52,6 @@ def __init__(self, match): self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" - self._write_pages = self.config("write-pages", False) - self._retry_codes = self.config("retry-codes") - self._retries = self.config("retries", 4) - self._timeout = self.config("timeout", 30) - self._verify = self.config("verify", True) - self._proxies = util.build_proxy_map(self.config("proxy"), self.log) - self._interval = util.build_duration_func( - self.config("sleep-request", self.request_interval), - self.request_interval_min, - ) - - if self._retries < 0: - self._retries = float("inf") - if not self._retry_codes: - self._retry_codes = () - - self._init_session() - self._init_cookies() - @classmethod def from_url(cls, url): if isinstance(cls.pattern, str): @@ -79,8 +60,16 @@ def from_url(cls, url): return cls(match) if match else None def __iter__(self): + self.initialize() return self.items() + def initialize(self): + self._init_options() + self._init_session() + self._init_cookies() + self._init() + self.initialize = util.noop + def items(self): yield Message.Version, 1 @@ -245,6 +234,26 @@ def _get_auth_info(self): return username, password + def _init(self): + pass + + def _init_options(self): + self._write_pages = self.config("write-pages", False) + self._retry_codes = self.config("retry-codes") + self._retries = self.config("retries", 4) + self._timeout = self.config("timeout", 30) + self._verify = self.config("verify", True) + self._proxies = util.build_proxy_map(self.config("proxy"), self.log) + self._interval = util.build_duration_func( + self.config("sleep-request", self.request_interval), + self.request_interval_min, + ) + + if self._retries < 0: + self._retries = float("inf") + if not self._retry_codes: + self._retry_codes = () + def _init_session(self): self.session = session = requests.Session() headers = session.headers @@ -454,6 +463,13 @@ def _prepare_ddosguard_cookies(self): self.cookies.set( "__ddg2", util.generate_token(), domain=self.cookies_domain) + def _cache(self, func, maxage, keyarg=None): + # return cache.DatabaseCacheDecorator(func, maxage, keyarg) + return cache.DatabaseCacheDecorator(func, keyarg, maxage) + + def _cache_memory(self, func, maxage=None, keyarg=None): + return cache.Memcache() + def _get_date_min_max(self, dmin=None, dmax=None): """Retrieve and parse 'date-min' and 'date-max' config values""" def get(key, default): @@ -654,6 +670,8 @@ class AsynchronousMixin(): """Run info extraction in a separate thread""" def __iter__(self): + self.initialize() + messages = queue.Queue(5) thread = threading.Thread( target=self.async_items, diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 19a3aeff18..b16d27a2a3 
100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -22,8 +22,7 @@ class DanbooruExtractor(BaseExtractor): per_page = 200 request_interval = 1.0 - def __init__(self, match): - BaseExtractor.__init__(self, match) + def _init(self): self.ugoira = self.config("ugoira", False) self.external = self.config("external", False) self.includes = False diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 9f16b3345c..3497b0c4e6 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -38,14 +38,18 @@ class DeviantartExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) + self.user = match.group(1) or match.group(2) + + def _init(self): self.flat = self.config("flat", True) self.extra = self.config("extra", False) self.original = self.config("original", True) self.comments = self.config("comments", False) - self.user = match.group(1) or match.group(2) + + self.api = DeviantartOAuthAPI(self) self.group = False self.offset = 0 - self.api = None + self._premium_cache = {} unwatch = self.config("auto-unwatch") if unwatch: @@ -60,11 +64,13 @@ def __init__(self, match): self._update_content = self._update_content_image self.original = True - self._premium_cache = {} - self.commit_journal = { - "html": self._commit_journal_html, - "text": self._commit_journal_text, - }.get(self.config("journals", "html")) + journals = self.config("journals", "html") + if journals == "html": + self.commit_journal = self._commit_journal_html + elif journals == "text": + self.commit_journal = self._commit_journal_text + else: + self.commit_journal = None def skip(self, num): self.offset += num @@ -80,8 +86,6 @@ def login(self): return True def items(self): - self.api = DeviantartOAuthAPI(self) - if self.user and self.config("group", True): profile = self.api.user_profile(self.user) self.group = not profile @@ -449,6 +453,9 @@ class DeviantartUserExtractor(DeviantartExtractor): ("https://shimoda7.deviantart.com/"), ) + def initialize(self): + pass + def items(self): base = "{}/{}/".format(self.root, self.user) return self._dispatch_extractors(( diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 087ff51cc2..d5f1d02b4d 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -31,17 +31,21 @@ class ExhentaiExtractor(Extractor): LIMIT = False def __init__(self, match): - # allow calling 'self.config()' before 'Extractor.__init__()' - self._cfgpath = ("extractor", self.category, self.subcategory) + Extractor.__init__(self, match) + self.version = match.group(1) - version = match.group(1) + def initialize(self): domain = self.config("domain", "auto") if domain == "auto": - domain = ("ex" if version == "ex" else "e-") + "hentai.org" + domain = ("ex" if self.version == "ex" else "e-") + "hentai.org" self.root = "https://" + domain self.cookies_domain = "." 
+ domain - Extractor.__init__(self, match) + Extractor.initialize(self) + + if self.version != "ex": + self.cookies.set("nw", "1", domain=self.cookies_domain) + self.session.headers["Referer"] = self.root + "/" self.original = self.config("original", True) limits = self.config("limits", False) @@ -51,10 +55,6 @@ def __init__(self, match): else: self.limits = False - self.session.headers["Referer"] = self.root + "/" - if version != "ex": - self.cookies.set("nw", "1", domain=self.cookies_domain) - def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) if response.history and response.headers.get("Content-Length") == "0": @@ -174,6 +174,7 @@ def __init__(self, match): self.image_token = match.group(4) self.image_num = text.parse_int(match.group(6), 1) + def _init(self): source = self.config("source") if source == "hitomi": self.items = self._items_hitomi diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 40ad8cdd95..921ddb62dd 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -6,9 +6,9 @@ """Extractors for https://www.fanbox.cc/""" -import re from .common import Extractor, Message from .. import text +import re BASE_PATTERN = ( @@ -27,8 +27,7 @@ class FanboxExtractor(Extractor): archive_fmt = "{id}_{num}" _warning = True - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.embeds = self.config("embeds", True) def items(self): diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index cb7d1e8123..3b18c63ecc 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -26,8 +26,10 @@ class FlickrExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.api = FlickrAPI(self) self.item_id = match.group(1) + + def _init(self): + self.api = FlickrAPI(self) self.user = None def items(self): diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 76fb69ebb4..fefb2c4c6f 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -22,10 +22,12 @@ class FoolfuukaExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) - self.session.headers["Referer"] = self.root if self.category == "b4k": self.remote = self._remote_direct + def _init(self): + self.session.headers["Referer"] = self.root + "/" + def items(self): yield Message.Directory, self.metadata() for post in self.posts(): diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index c03c89b2ef..8c3ef79d6b 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -28,6 +28,8 @@ class FuraffinityExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) + + def _init(self): self.offset = 0 if self.config("descriptions") == "html": @@ -384,6 +386,9 @@ class FuraffinityUserExtractor(FuraffinityExtractor): }), ) + def initialize(self): + pass + def items(self): base = "{}/{{}}/{}/".format(self.root, self.user) return self._dispatch_extractors(( diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 958c4b5875..1ef78efd65 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -19,8 +19,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): basecategory = "gelbooru_v02" - def __init__(self, match): - booru.BooruExtractor.__init__(self, match) + def _init(self): 
self.api_key = self.config("api-key") self.user_id = self.config("user-id") diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index ccebdf9886..53ef118056 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -24,6 +24,7 @@ def __init__(self, match): Extractor.__init__(self, match) self.key = match.group(1).lower() + def _init(self): formats = self.config("format") if formats is None: formats = ("mp4", "webm", "mobile", "gif") diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index 593a846451..ac03923ffd 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -57,7 +57,9 @@ def __init__(self, match): self.root = text.ensure_http_scheme(root) url = "{}/story/{}/".format(self.root, self.slug) GalleryExtractor.__init__(self, match, url) - self.session.headers["Referer"] = url + + def _init(self): + self.session.headers["Referer"] = self.gallery_url def metadata(self, page): title = text.extr(page, "<title>", "") diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 78a576df84..56ea1d4d31 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -170,6 +170,9 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor): pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile" test = ("https://www.hentai-foundry.com/user/Tenpura/profile",) + def initialize(self): + pass + def items(self): root = self.root user = "/user/" + self.user diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 4e8d1caee1..c012c665e5 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -66,12 +66,13 @@ class HitomiGalleryExtractor(GalleryExtractor): ) def __init__(self, match): - gid = match.group(1) - url = "https://ltn.hitomi.la/galleries/{}.js".format(gid) + self.gid = match.group(1) + url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gid) GalleryExtractor.__init__(self, match, url) - self.info = None + + def _init(self): self.session.headers["Referer"] = "{}/reader/{}.html".format( - self.root, gid) + self.root, self.gid) def metadata(self, page): self.info = info = util.json_loads(page.partition("=")[2]) diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index 30158b433d..2ab9f3ccbf 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -21,9 +21,8 @@ class HotleakExtractor(Extractor): archive_fmt = "{type}_{creator}_{id}" root = "https://hotleak.vip" - def __init__(self, match): - Extractor.__init__(self, match) - self.session.headers["Referer"] = self.root + def _init(self): + self.session.headers["Referer"] = self.root + "/" def items(self): for post in self.posts(): diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index 02f037dd85..fcac7fe279 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -29,6 +29,8 @@ def __init__(self, match): self.logged_in = True self.start_page = 1 self.start_post = 0 + + def _init(self): self.extags = self.config("tags", False) def items(self): diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 67d0b11052..9a3ea3689a 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2022 Mike Fährmann +# Copyright 2014-2023 Mike Fährmann # # This 
program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,6 +21,8 @@ class ImagebamExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.path = match.group(1) + + def _init(self): self.cookies.set("nsfw_inter", "1", domain="www.imagebam.com") def _parse_image_page(self, path): diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index 9229617bed..2babead87c 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -47,8 +47,10 @@ def __init__(self, match): url = self.root + "/p/" + self.gallery_id GalleryExtractor.__init__(self, match, url) - self.access_token = self.config("access-token") - if self.access_token: + def _init(self): + access_token = self.config("access-token") + if access_token: + self.api = ImagechestAPI(self, access_token) self.gallery_url = None self.metadata = self._metadata_api self.images = self._images_api @@ -82,8 +84,7 @@ def images(self, page): ] def _metadata_api(self, page): - api = ImagechestAPI(self, self.access_token) - post = api.post(self.gallery_id) + post = self.api.post(self.gallery_id) post["date"] = text.parse_datetime( post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index f5b69faa2e..43ac3a353a 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -23,9 +23,8 @@ class ImagefapExtractor(Extractor): archive_fmt = "{gallery_id}_{image_id}" request_interval = (2.0, 4.0) - def __init__(self, match): - Extractor.__init__(self, match) - self.session.headers["Referer"] = self.root + def _init(self): + self.session.headers["Referer"] = self.root + "/" def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 4c29d98ff1..ca9671c3b3 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -22,8 +22,10 @@ class ImgurExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.api = ImgurAPI(self) self.key = match.group(1) + + def _init(self): + self.api = ImgurAPI(self) self.mp4 = self.config("mp4", True) def _prepare(self, image): diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 83a1a194ae..c6df16b0b2 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -24,8 +24,7 @@ class InkbunnyExtractor(Extractor): archive_fmt = "{file_id}" root = "https://inkbunny.net" - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.api = InkbunnyAPI(self) def items(self): diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 29208aef07..cb77fa1c7f 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -34,16 +34,8 @@ class InstagramExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.item = match.group(1) - self.api = None - self.www_claim = "0" - self.csrf_token = util.generate_token() - self._logged_in = True - self._find_tags = re.compile(r"#\w+").findall - self._cursor = None - self._user = None - def items(self): - self.login() + def _init(self): self.cookies.set( "csrftoken", self.csrf_token, domain=self.cookies_domain) @@ -52,6 +44,16 @@ def items(self): else: self.api = InstagramRestAPI(self) + self.www_claim = "0" + 
self.csrf_token = util.generate_token() + self._find_tags = re.compile(r"#\w+").findall + self._logged_in = True + self._cursor = None + self._user = None + + def items(self): + self.login() + data = self.metadata() videos = self.config("videos", True) previews = self.config("previews", False) @@ -400,6 +402,9 @@ class InstagramUserExtractor(InstagramExtractor): ("https://www.instagram.com/id:25025320/"), ) + def initialize(self): + pass + def items(self): base = "{}/{}/".format(self.root, self.item) stories = "{}/stories/{}/".format(self.root, self.item) diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py index 4bcedae196..356a002b90 100644 --- a/gallery_dl/extractor/itaku.py +++ b/gallery_dl/extractor/itaku.py @@ -26,8 +26,10 @@ class ItakuExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.api = ItakuAPI(self) self.item = match.group(1) + + def _init(self): + self.api = ItakuAPI(self) self.videos = self.config("videos", True) def items(self): diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index d5d02c2934..2ed73e9cc2 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -35,14 +35,15 @@ def __init__(self, match): self.root = text.root_from_url(match.group(0)) self.cookies_domain = ".{}.{}".format(domain, tld) Extractor.__init__(self, match) - self.session.headers["Referer"] = self.root + "/" - def items(self): + def _init(self): + self.session.headers["Referer"] = self.root + "/" self._prepare_ddosguard_cookies() - self._find_inline = re.compile( r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall + + def items(self): find_hash = re.compile(HASH_PATTERN).match generators = self._build_file_generators(self.config("files")) duplicates = self.config("duplicates") diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index 5d236c37f6..9cebe3aef2 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -46,9 +46,10 @@ def __init__(self, match): LolisafeExtractor.__init__(self, match) self.album_id = match.group(match.lastindex) + def _init(self): domain = self.config("domain") if domain == "auto": - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(self.url) elif domain: self.root = text.ensure_http_scheme(domain) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 80f8758c17..dcf09d163c 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -118,6 +118,8 @@ class LusciousAlbumExtractor(LusciousExtractor): def __init__(self, match): LusciousExtractor.__init__(self, match) self.album_id = match.group(1) + + def _init(self): self.gif = self.config("gif", False) def items(self): diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index e111fee353..b0c985de56 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -30,9 +30,11 @@ class MangadexExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) + self.uuid = match.group(1) + + def _init(self): self.session.headers["User-Agent"] = util.USERAGENT 
self.api = MangadexAPI(self) - self.uuid = match.group(1) def items(self): for chapter in self.chapters(): @@ -202,7 +204,7 @@ def __init__(self, extr): self.extractor = extr self.headers = {} - self.username, self.password = self.extractor._get_auth_info() + self.username, self.password = extr._get_auth_info() if not self.username: self.authenticate = util.noop diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py index 0818fd90ab..8478b8de00 100644 --- a/gallery_dl/extractor/mangafox.py +++ b/gallery_dl/extractor/mangafox.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2022 Mike Fährmann +# Copyright 2017-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -33,6 +33,8 @@ def __init__(self, match): base, self.cstr, self.volume, self.chapter, self.minor = match.groups() self.urlbase = self.root + base ChapterExtractor.__init__(self, match, self.urlbase + "/1.html") + + def _init(self): self.session.headers["Referer"] = self.root + "/" def metadata(self, page): diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py index ccce09b4e7..97c26d473a 100644 --- a/gallery_dl/extractor/mangahere.py +++ b/gallery_dl/extractor/mangahere.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2022 Mike Fährmann +# Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -42,6 +42,8 @@ def __init__(self, match): self.part, self.volume, self.chapter = match.groups() url = self.url_fmt.format(self.part, 1) ChapterExtractor.__init__(self, match, url) + + def _init(self): self.session.headers["Referer"] = self.root_mobile + "/" def metadata(self, page): diff --git a/gallery_dl/extractor/mangakakalot.py b/gallery_dl/extractor/mangakakalot.py index ba55ac162a..e397586e3b 100644 --- a/gallery_dl/extractor/mangakakalot.py +++ b/gallery_dl/extractor/mangakakalot.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Copyright 2020 Jake Mannens -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -39,7 +39,9 @@ class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): def __init__(self, match): self.path = match.group(1) ChapterExtractor.__init__(self, match, self.root + self.path) - self.session.headers['Referer'] = self.root + + def _init(self): + self.session.headers['Referer'] = self.root + "/" def metadata(self, page): _ , pos = text.extract(page, '', '<') diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index 6fd9f495e2..807bc5ee79 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -21,7 +21,9 @@ class ManganeloBase(): def __init__(self, match): domain, path = match.groups() super().__init__(match, "https://" + domain + path) - self.session.headers['Referer'] = self.root + + def _init(self): + self.session.headers['Referer'] = self.root + "/" if self._match_chapter is None: ManganeloBase._match_chapter = re.compile( diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py index dfa9bdf010..00c89c1e63 100644 --- a/gallery_dl/extractor/mangasee.py +++ b/gallery_dl/extractor/mangasee.py @@ -90,6 +90,8 @@ def __init__(self, match): self.category = 
"mangalife" self.root = "https://manga4life.com" ChapterExtractor.__init__(self, match, self.root + match.group(2)) + + def _init(self): self.session.headers["Referer"] = self.gallery_url domain = self.root.rpartition("/")[2] diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index ddd34f0d06..3bed955c1c 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -23,8 +23,10 @@ class MastodonExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) - self.instance = self.root.partition("://")[2] self.item = match.group(match.lastindex) + + def _init(self): + self.instance = self.root.partition("://")[2] self.reblogs = self.config("reblogs", False) self.replies = self.config("replies", True) diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py index 37efac070f..8c717581f6 100644 --- a/gallery_dl/extractor/misskey.py +++ b/gallery_dl/extractor/misskey.py @@ -19,9 +19,11 @@ class MisskeyExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) + self.item = match.group(match.lastindex) + + def _init(self): self.api = MisskeyAPI(self) self.instance = self.root.rpartition("://")[2] - self.item = match.group(match.lastindex) self.renotes = self.config("renotes", False) self.replies = self.config("replies", True) diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py index 5dc4cb60b1..3301da979e 100644 --- a/gallery_dl/extractor/myhentaigallery.py +++ b/gallery_dl/extractor/myhentaigallery.py @@ -38,7 +38,9 @@ def __init__(self, match): self.gallery_id = match.group(1) url = "{}/gallery/thumbnails/{}".format(self.root, self.gallery_id) GalleryExtractor.__init__(self, match, url) - self.session.headers["Referer"] = url + + def _init(self): + self.session.headers["Referer"] = self.gallery_url def metadata(self, page): extr = text.extract_from(page) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index e3ea3fc9fe..8a25528076 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -29,6 +29,8 @@ def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) self.user_root = "https://{}.newgrounds.com".format(self.user) + + def _init(self): self.flash = self.config("flash", True) fmt = self.config("format", "original") @@ -517,6 +519,9 @@ class NewgroundsUserExtractor(NewgroundsExtractor): }), ) + def initialize(self): + pass + def items(self): base = self.user_root + "/" return self._dispatch_extractors(( diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index e822895b10..66040d8a1e 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -21,18 +21,19 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): archive_fmt = "{image_id}_{num}" def __init__(self, match): - self._init_category(match) + BaseExtractor.__init__(self, match) + self.user_id = text.parse_int(match.group(match.lastindex)) + + def initialize(self): self.cookies_domain = "." 
+ self.root.rpartition("/")[2] self.cookies_names = (self.category + "_tok",) - if self.category == "horne": - self._extract_data = self._extract_data_horne - - BaseExtractor.__init__(self, match) + BaseExtractor.initialize(self) - self.user_id = text.parse_int(match.group(match.lastindex)) - self.user_name = None self.session.headers["Referer"] = self.root + "/" + self.user_name = None + if self.category == "horne": + self._extract_data = self._extract_data_horne def items(self): self.login() @@ -180,6 +181,9 @@ class NijieUserExtractor(NijieExtractor): ("https://horne.red/members.php?id=58000"), ) + def initialize(self): + pass + def items(self): fmt = "{}/{{}}.php?id={}".format(self.root, self.user_id).format return self._dispatch_extractors(( diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 824757ce9b..f109d25828 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -28,6 +28,8 @@ class OAuthBase(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.client = None + + def _init(self): self.cache = config.get(("extractor", self.category), "cache", True) def oauth_config(self, key, default=None): diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 7bccf8386c..6bc7b9a9a3 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -108,6 +108,7 @@ def __init__(self, match): PahealExtractor.__init__(self, match) self.tags = text.unquote(match.group(1)) + def _init(self): if self.config("metadata"): self._extract_data = self._extract_data_ex diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index e7188285cb..8fa5de24f0 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -22,8 +22,7 @@ class PhilomenaExtractor(BooruExtractor): page_start = 1 per_page = 50 - def __init__(self, match): - BooruExtractor.__init__(self, match) + def _init(self): self.api = PhilomenaAPI(self) _file_url = operator.itemgetter("view_url") diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py index 6234e6a957..22aff88460 100644 --- a/gallery_dl/extractor/photobucket.py +++ b/gallery_dl/extractor/photobucket.py @@ -48,9 +48,10 @@ class PhotobucketAlbumExtractor(Extractor): ) def __init__(self, match): - Extractor.__init__(self, match) - self.album_path = "" self.root = "https://" + match.group(1) + Extractor.__init__(self, match) + + def _init(self): self.session.headers["Referer"] = self.url def items(self): @@ -129,6 +130,8 @@ def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) or match.group(3) self.media_id = match.group(2) + + def _init(self): self.session.headers["Referer"] = self.url def items(self): diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 92e0588657..be30705bc2 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -23,12 +23,10 @@ class PinterestExtractor(Extractor): archive_fmt = "{id}{media_id}" root = "https://www.pinterest.com" - def __init__(self, match): - Extractor.__init__(self, match) - + def _init(self): domain = self.config("domain") if not domain or domain == "auto" : - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(self.url) else: self.root = text.ensure_http_scheme(domain) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 3cc59acfff..ffe8030f72 100644 --- 
a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -28,8 +28,7 @@ class PixivExtractor(Extractor): archive_fmt = "{id}{suffix}.{extension}" cookies_domain = None - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.api = PixivAppAPI(self) self.load_ugoira = self.config("ugoira", True) self.max_posts = self.config("max-posts", 0) @@ -174,6 +173,9 @@ def __init__(self, match): PixivExtractor.__init__(self, match) self.user_id = match.group(1) + def initialize(self): + pass + def items(self): base = "{}/users/{}/".format(self.root, self.user_id) return self._dispatch_extractors(( diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py index 783f3da954..929e0f58bb 100644 --- a/gallery_dl/extractor/pornpics.py +++ b/gallery_dl/extractor/pornpics.py @@ -23,7 +23,9 @@ class PornpicsExtractor(Extractor): def __init__(self, match): super().__init__(match) self.item = match.group(1) - self.session.headers["Referer"] = self.root + + def _init(self): + self.session.headers["Referer"] = self.root + "/" def items(self): for gallery in self.galleries(): diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 1800b68d1c..ba571bbd3e 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -22,18 +22,21 @@ class ReactorExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) + url = text.ensure_http_scheme(match.group(0), "http://") pos = url.index("/", 10) - - self.root, self.path = url[:pos], url[pos:] - self.session.headers["Referer"] = self.root - self.gif = self.config("gif", False) + self.root = url[:pos] + self.path = url[pos:] if self.category == "reactor": # set category based on domain name netloc = urllib.parse.urlsplit(self.root).netloc self.category = netloc.rpartition(".")[0] + def _init(self): + self.session.headers["Referer"] = self.root + self.gif = self.config("gif", False) + def items(self): data = self.metadata() yield Message.Directory, data diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index c924e0a3cb..c68068cb8f 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -57,8 +57,10 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): def __init__(self, match): ChapterExtractor.__init__(self, match) + self.params = match.group(2) - params = text.parse_query(match.group(2)) + def _init(self): + params = text.parse_query(self.params) quality = self.config("quality") if quality is None or quality == "auto": diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 9109e8dc31..abd21b300e 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -24,6 +24,8 @@ class RedgifsExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.key = match.group(1) + + def _init(self): self.api = RedgifsAPI(self) formats = self.config("format") diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py index 00b6972d2e..88331eaaf0 100644 --- a/gallery_dl/extractor/rule34us.py +++ b/gallery_dl/extractor/rule34us.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,8 +19,7 @@ class Rule34usExtractor(BooruExtractor): 
root = "https://rule34.us" per_page = 42 - def __init__(self, match): - BooruExtractor.__init__(self, match) + def _init(self): self._find_tags = re.compile( r'
  • ]*>", "") diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index b0dd9bbdea..de6c8a15cf 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -18,21 +18,20 @@ class Shimmie2Extractor(BaseExtractor): filename_fmt = "{category}_{id}{md5:?_//}.{extension}" archive_fmt = "{id}" - def __init__(self, match): - BaseExtractor.__init__(self, match) - + def _init(self): try: instance = INSTANCES[self.category] except KeyError: - pass - else: - cookies = instance.get("cookies") - if cookies: - domain = self.root.rpartition("/")[2] - self.cookies_update_dict(cookies, domain=domain) - file_url = instance.get("file_url") - if file_url: - self.file_url_fmt = file_url + return + + cookies = instance.get("cookies") + if cookies: + domain = self.root.rpartition("/")[2] + self.cookies_update_dict(cookies, domain=domain) + + file_url = instance.get("file_url") + if file_url: + self.file_url_fmt = file_url def items(self): data = self.metadata() diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index b5d116fd30..d1ccc49275 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -40,7 +40,9 @@ def __init__(self, match): path = "/" + subdomain.rstrip(".") + path url = "https://old.simply-hentai.com" + path GalleryExtractor.__init__(self, match, url) - self.session.headers["Referer"] = url + + def _init(self): + self.session.headers["Referer"] = self.gallery_url def metadata(self, page): extr = text.extract_from(page) diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 3724c8592e..b643c6f223 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -22,6 +22,8 @@ class SkebExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user_name = match.group(1) + + def _init(self): self.thumbnails = self.config("thumbnails", False) self.article = self.config("article", False) diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index e30c491156..b9edd4ab41 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -34,8 +34,7 @@ class SmugmugExtractor(Extractor): "Uris": None, } - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.api = SmugmugAPI(self) self.videos = self.config("videos", True) self.session = self.api.session diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py index 4b15b1440a..8c816ad189 100644 --- a/gallery_dl/extractor/szurubooru.py +++ b/gallery_dl/extractor/szurubooru.py @@ -20,8 +20,7 @@ class SzurubooruExtractor(booru.BooruExtractor): filename_fmt = "{id}_{version}_{checksumMD5}.{extension}" per_page = 100 - def __init__(self, match): - booru.BooruExtractor.__init__(self, match) + def _init(self): self.headers = { "Accept": "application/json", "Content-Type": "application/json", diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index ec4a249c33..0e09e22aee 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -26,8 +26,7 @@ class TapasExtractor(Extractor): cookies_names = ("_cpc_",) _cache = None - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): if self._cache is None: TapasExtractor._cache = {} diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index f42da48865..12ea39f8c5 100644 --- a/gallery_dl/extractor/tumblr.py +++ 
b/gallery_dl/extractor/tumblr.py @@ -42,6 +42,7 @@ def __init__(self, match): else: self.blog = match.group(1) or match.group(3) + def _init(self): self.api = TumblrAPI(self) self.types = self._setup_posttypes() self.avatar = self.config("avatar", False) diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index a8acd319ca..c3e0a2620c 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -26,8 +26,7 @@ class TwibooruExtractor(BooruExtractor): per_page = 50 root = "https://twibooru.org" - def __init__(self, match): - BooruExtractor.__init__(self, match) + def _init(self): self.api = TwibooruAPI(self) _file_url = operator.itemgetter("view_url") diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 7e42079974..478b6d3879 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -32,6 +32,8 @@ class TwitterExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) + + def _init(self): self.textonly = self.config("text-tweets", False) self.retweets = self.config("retweets", False) self.replies = self.config("replies", True) @@ -490,6 +492,9 @@ def __init__(self, match): if user_id: self.user = "id:" + user_id + def initialize(self): + pass + def items(self): base = "{}/{}/".format(self.root, self.user) return self._dispatch_extractors(( diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py index 972b508db1..4b49a638c7 100644 --- a/gallery_dl/extractor/urlshortener.py +++ b/gallery_dl/extractor/urlshortener.py @@ -54,6 +54,7 @@ def __init__(self, match): UrlshortenerExtractor.__init__(self, match) self.id = match.group(match.lastindex) + def _init(self): try: self.headers = INSTANCES[self.category]["headers"] except Exception: diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index d8aa6cdb6d..084f9b2526 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -26,9 +26,8 @@ class VipergirlsExtractor(Extractor): cookies_domain = ".vipergirls.to" cookies_names = ("vg_userid", "vg_password") - def __init__(self, match): - Extractor.__init__(self, match) - self.session.headers["Referer"] = self.root + def _init(self): + self.session.headers["Referer"] = self.root + "/" def items(self): self.login() diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 0ba0d910d8..9e2710987f 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -20,8 +20,7 @@ class WallhavenExtractor(Extractor): archive_fmt = "{id}" request_interval = 1.4 - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.api = WallhavenAPI(self) def items(self): @@ -109,6 +108,9 @@ def __init__(self, match): WallhavenExtractor.__init__(self, match) self.username = match.group(1) + def initialize(self): + pass + def items(self): base = "{}/user/{}/".format(self.root, self.username) return self._dispatch_extractors(( diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index eca4f1adf5..c4d242a166 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -30,8 +30,7 @@ def populate_submission(data): return True return False - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.session.headers['X-Weasyl-API-Key'] = self.config("api-key") def request_submission(self, submitid): diff --git 
a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 7b3e8033f2..315924209f 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Copyright 2020 Leonardo Taccari -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -71,15 +71,18 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): ) def __init__(self, match): - self.path, self.lang, self.genre, self.comic, query = match.groups() + self.path, self.lang, self.genre, self.comic, self.query = \ + match.groups() - url = "{}/{}/viewer?{}".format(self.root, self.path, query) + url = "{}/{}/viewer?{}".format(self.root, self.path, self.query) GalleryExtractor.__init__(self, match, url) + + def _init(self): self.setup_agegate_cookies() - query = text.parse_query(query) - self.title_no = query.get("title_no") - self.episode_no = query.get("episode_no") + params = text.parse_query(self.query) + self.title_no = params.get("title_no") + self.episode_no = params.get("episode_no") def metadata(self, page): keywords, pos = text.extract( @@ -141,12 +144,15 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor): def __init__(self, match): Extractor.__init__(self, match) + self.path, self.lang, self.genre, self.comic, self.query = \ + match.groups() + + def _init(self): self.setup_agegate_cookies() - self.path, self.lang, self.genre, self.comic, query = match.groups() - query = text.parse_query(query) - self.title_no = query.get("title_no") - self.page_no = text.parse_int(query.get("page"), 1) + params = text.parse_query(self.query) + self.title_no = params.get("title_no") + self.page_no = text.parse_int(params.get("page"), 1) def items(self): page = None diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 2de7a2fc2a..ae0fc4e39e 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -28,6 +28,8 @@ class WeiboExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self._prefix, self.user = match.groups() + + def _init(self): self.retweets = self.config("retweets", True) self.videos = self.config("videos", True) self.livephoto = self.config("livephoto", True) @@ -228,6 +230,9 @@ class WeiboUserExtractor(WeiboExtractor): ("https://www.weibo.com/p/1003062314621010/home"), ) + def initialize(self): + pass + def items(self): base = "{}/u/{}?tabtype=".format(self.root, self._user_id()) return self._dispatch_extractors(( diff --git a/test/test_cookies.py b/test/test_cookies.py index 5a4fbe65c0..a6ad05f1a1 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -46,8 +46,7 @@ def tearDownClass(cls): def test_cookiefile(self): config.set((), "cookies", self.cookiefile) - - cookies = extractor.find("test:").cookies + cookies = _get_extractor("test").cookies self.assertEqual(len(cookies), 1) cookie = next(iter(cookies)) @@ -65,12 +64,14 @@ def test_invalid_filename(self): def _test_warning(self, filename, exc): config.set((), "cookies", filename) log = logging.getLogger("test") + with mock.patch.object(log, "warning") as mock_warning: - cookies = extractor.find("test:").cookies - self.assertEqual(len(cookies), 0) - self.assertEqual(mock_warning.call_count, 1) - self.assertEqual(mock_warning.call_args[0][0], "cookies: %s") - self.assertIsInstance(mock_warning.call_args[0][1], exc) + cookies = 
_get_extractor("test").cookies + + self.assertEqual(len(cookies), 0) + self.assertEqual(mock_warning.call_count, 1) + self.assertEqual(mock_warning.call_args[0][0], "cookies: %s") + self.assertIsInstance(mock_warning.call_args[0][1], exc) class TestCookiedict(unittest.TestCase): @@ -83,7 +84,8 @@ def tearDown(self): config.clear() def test_dict(self): - cookies = extractor.find("test:").cookies + cookies = _get_extractor("test").cookies + self.assertEqual(len(cookies), len(self.cdict)) self.assertEqual(sorted(cookies.keys()), sorted(self.cdict.keys())) self.assertEqual(sorted(cookies.values()), sorted(self.cdict.values())) @@ -122,7 +124,7 @@ def test_cookie_login(self): class TestCookieUtils(unittest.TestCase): def test_check_cookies(self): - extr = extractor.find("test:") + extr = _get_extractor("test") self.assertFalse(extr.cookies, "empty") self.assertFalse(extr.cookies_domain, "empty") @@ -144,7 +146,7 @@ def test_check_cookies(self): self.assertFalse(extr.cookies_check(("a", "b", "c"))) def test_check_cookies_domain(self): - extr = extractor.find("test:") + extr = _get_extractor("test") self.assertFalse(extr.cookies, "empty") extr.cookies_domain = ".example.org" @@ -166,7 +168,7 @@ def test_check_cookies_domain(self): self.assertTrue(extr.cookies_check(("a", "b", "c"))) def test_check_cookies_expires(self): - extr = extractor.find("test:") + extr = _get_extractor("test") self.assertFalse(extr.cookies, "empty") self.assertFalse(extr.cookies_domain, "empty") @@ -200,13 +202,18 @@ def test_check_cookies_expires(self): def _get_extractor(category): - URLS = { - "exhentai" : "https://exhentai.org/g/1200119/d55c44d3d0/", - "idolcomplex": "https://idol.sankakucomplex.com/post/show/1", - "nijie" : "https://nijie.info/view.php?id=1", - "horne" : "https://horne.red/view.php?id=1", - } - return extractor.find(URLS[category]) + extr = extractor.find(URLS[category]) + extr.initialize() + return extr + + +URLS = { + "exhentai" : "https://exhentai.org/g/1200119/d55c44d3d0/", + "idolcomplex": "https://idol.sankakucomplex.com/post/show/1", + "nijie" : "https://nijie.info/view.php?id=1", + "horne" : "https://horne.red/view.php?id=1", + "test" : "test:", +} if __name__ == "__main__": diff --git a/test/test_downloader.py b/test/test_downloader.py index c65be9528e..840e0780e9 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -34,6 +34,7 @@ class FakeJob(): def __init__(self): self.extractor = extractor.find("test:") + self.extractor.initialize() self.pathfmt = path.PathFormat(self.extractor) self.out = output.NullOutput() self.get_logger = logging.getLogger From 11f71a9cbab268bdbe3a827d32d881d855119a7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 25 Jul 2023 20:18:15 +0200 Subject: [PATCH 240/252] remove 'mememuseum' module This was forgotten when adding generic Shimmie2 support in 7865067d --- gallery_dl/extractor/mememuseum.py | 120 ----------------------------- 1 file changed, 120 deletions(-) delete mode 100644 gallery_dl/extractor/mememuseum.py diff --git a/gallery_dl/extractor/mememuseum.py b/gallery_dl/extractor/mememuseum.py deleted file mode 100644 index 1de0d768c4..0000000000 --- a/gallery_dl/extractor/mememuseum.py +++ /dev/null @@ -1,120 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2022 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. 
- -"""Extractors for https://meme.museum/""" - -from .common import Extractor, Message -from .. import text - - -class MememuseumExtractor(Extractor): - """Base class for meme.museum extractors""" - basecategory = "booru" - category = "mememuseum" - filename_fmt = "{category}_{id}_{md5}.{extension}" - archive_fmt = "{id}" - root = "https://meme.museum" - - def items(self): - data = self.metadata() - - for post in self.posts(): - url = post["file_url"] - for key in ("id", "width", "height"): - post[key] = text.parse_int(post[key]) - post["tags"] = text.unquote(post["tags"]) - post.update(data) - yield Message.Directory, post - yield Message.Url, url, text.nameext_from_url(url, post) - - def metadata(self): - """Return general metadata""" - return () - - def posts(self): - """Return an iterable containing data of all relevant posts""" - return () - - -class MememuseumTagExtractor(MememuseumExtractor): - """Extractor for images from meme.museum by search-tags""" - subcategory = "tag" - directory_fmt = ("{category}", "{search_tags}") - pattern = r"(?:https?://)?meme\.museum/post/list/([^/?#]+)" - test = ("https://meme.museum/post/list/animated/1", { - "pattern": r"https://meme\.museum/_images/\w+/\d+%20-%20", - "count": ">= 30" - }) - per_page = 25 - - def __init__(self, match): - MememuseumExtractor.__init__(self, match) - self.tags = text.unquote(match.group(1)) - - def metadata(self): - return {"search_tags": self.tags} - - def posts(self): - pnum = 1 - while True: - url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) - extr = text.extract_from(self.request(url).text) - - while True: - mime = extr("data-mime='", "'") - if not mime: - break - - pid = extr("data-post-id='", "'") - tags, dimensions, size = extr("title='", "'").split(" // ") - md5 = extr("/_thumbs/", "/") - width, _, height = dimensions.partition("x") - - yield { - "file_url": "{}/_images/{}/{}%20-%20{}.{}".format( - self.root, md5, pid, text.quote(tags), - mime.rpartition("/")[2]), - "id": pid, "md5": md5, "tags": tags, - "width": width, "height": height, - "size": text.parse_bytes(size[:-1]), - } - - if not extr(">Next<", ">"): - return - pnum += 1 - - -class MememuseumPostExtractor(MememuseumExtractor): - """Extractor for single images from meme.museum""" - subcategory = "post" - pattern = r"(?:https?://)?meme\.museum/post/view/(\d+)" - test = ("https://meme.museum/post/view/10243", { - "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc4997" - r"1f78/10243%20-%20g%20beard%20open_source%20richard_stallm" - r"an%20stallman%20tagme%20text\.jpg", - "keyword": "3c8009251480cf17248c08b2b194dc0c4d59580e", - "content": "45565f3f141fc960a8ae1168b80e718a494c52d2", - }) - - def __init__(self, match): - MememuseumExtractor.__init__(self, match) - self.post_id = match.group(1) - - def posts(self): - url = "{}/post/view/{}".format(self.root, self.post_id) - extr = text.extract_from(self.request(url).text) - - return ({ - "id" : self.post_id, - "tags" : extr(": ", "<"), - "md5" : extr("/_thumbs/", "/"), - "file_url": self.root + extr("id='main_image' src='", "'"), - "width" : extr("data-width=", " ").strip("'\""), - "height" : extr("data-height=", " ").strip("'\""), - "size" : 0, - },) From c6d70c748325ebddbcaed80ef6acc21145de0811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 26 Jul 2023 14:01:16 +0200 Subject: [PATCH 241/252] [docs] fix typo (#4350) --- docs/configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.rst 
b/docs/configuration.rst index 9bfc1e7304..0be6c19910 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1158,7 +1158,7 @@ Description Note: This requires 1 additional HTTP request per 200-post batch. -extractor.{Danbooru].threshold +extractor.[Danbooru].threshold ------------------------------ Type * ``string`` From 52053b58f0e527e18cf3e081f507202580544759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 26 Jul 2023 13:53:22 +0200 Subject: [PATCH 242/252] [lensdump] fix extraction (#4352) --- gallery_dl/extractor/lensdump.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index 89906215fc..43fc24e1e4 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -50,7 +50,7 @@ class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): pattern = BASE_PATTERN + r"/(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))" test = ( ("https://lensdump.com/a/1IhJr", { - "url": "7428cc906e7b291c778d446a11c602b81ba72840", + "pattern": r"https://[abcd]\.l3n\.co/i/tq\w{4}\.png", "keyword": { "extension": "png", "name": str, @@ -77,6 +77,7 @@ def images(self, page): for node in self.nodes(page): # get urls and filenames of images in current page json_data = util.json_loads(text.unquote( + text.extr(node, "data-object='", "'") or text.extr(node, 'data-object="', '"'))) image_id = json_data.get('name') image_url = json_data.get('url') @@ -118,8 +119,7 @@ class LensdumpImageExtractor(LensdumpBase, Extractor): pattern = BASE_PATTERN + r"/i/(\w+)" test = ( ("https://lensdump.com/i/tyoAyM", { - "pattern": r"https://i\d\.lensdump\.com/i/tyoAyM\.webp", - "url": "ae9933f5f3bd9497bfc34e3e70a0fbef6c562d38", + "pattern": r"https://c\.l3n\.co/i/tyoAyM\.webp", "content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46", "keyword": { "date": "dt:2022-08-01 08:24:28", @@ -128,7 +128,7 @@ class LensdumpImageExtractor(LensdumpBase, Extractor): "height": 400, "id": "tyoAyM", "title": "MYOBI clovis bookcaseset", - "url": "https://i2.lensdump.com/i/tyoAyM.webp", + "url": "https://c.l3n.co/i/tyoAyM.webp", "width": 620, }, }), From 1ece3b92ffdeab62b667c78d9c2c19485e0ce853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 26 Jul 2023 17:14:12 +0200 Subject: [PATCH 243/252] [mangadex] allow multiple values for 'lang' (#4093) This was already possible by setting 'lang' to a list of strings, but now it can also be done as a more command-line friendly string. -o lang=fr,it --- docs/configuration.rst | 9 ++++++--- gallery_dl/extractor/mangadex.py | 17 ++++++++++++++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 0be6c19910..cc2bbc9096 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2093,11 +2093,14 @@ Description extractor.mangadex.lang ----------------------- Type - ``string`` + * ``string`` + * ``list`` of ``strings`` Example - ``"en"`` + * ``"en"`` + * ``"fr,it"`` + * ``["fr", "it"]`` Description - `ISO 639-1 `__ language code + `ISO 639-1 `__ language codes to filter chapters by. 
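The mangadex.py change below implements this by splitting a comma-separated "lang" string into a list before it is passed as the "translatedLanguage[]" API parameter. As a minimal standalone sketch of that splitting behavior (the parse_lang helper is illustrative only and not part of gallery-dl's API):

    def parse_lang(lang):
        # A comma-separated string becomes a list of language codes;
        # lists and single codes pass through unchanged.
        if isinstance(lang, str) and "," in lang:
            return lang.split(",")
        return lang

    print(parse_lang("fr,it"))       # ['fr', 'it']
    print(parse_lang(["fr", "it"]))  # ['fr', 'it'] (unchanged)
    print(parse_lang("en"))          # en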
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index b0c985de56..e12e56b40d 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -150,6 +150,7 @@ class MangadexMangaExtractor(MangadexExtractor): pattern = BASE_PATTERN + r"/(?:title|manga)/(?!feed$)([0-9a-f-]+)" test = ( ("https://mangadex.org/title/f90c4398-8aad-4f51-8a1f-024ca09fdcbc", { + "count": ">= 5", "keyword": { "manga" : "Souten no Koumori", "manga_id": "f90c4398-8aad-4f51-8a1f-024ca09fdcbc", @@ -168,6 +169,16 @@ "Martial Arts", "Drama", "Tragedy"], }, }), + # multiple values for 'lang' (#4093) + ("https://mangadex.org/title/f90c4398-8aad-4f51-8a1f-024ca09fdcbc", { + "options": (("lang", "fr,it"),), + "count": 2, + "keyword": { + "manga" : "Souten no Koumori", + "lang" : "re:fr|it", + "language": "re:French|Italian", + }, + }), ("https://mangadex.cc/manga/d0c88e3b-ea64-4e07-9841-c1d2ac982f4a/", { "options": (("lang", "en"),), "count": ">= 100", @@ -290,9 +301,13 @@ def _pagination(self, endpoint, params=None): if ratings is None: ratings = ("safe", "suggestive", "erotica", "pornographic") + lang = config("lang") + if isinstance(lang, str) and "," in lang: + lang = lang.split(",") + params["contentRating[]"] = ratings + params["translatedLanguage[]"] = lang params["includes[]"] = ("scanlation_group",) - params["translatedLanguage[]"] = config("lang") params["offset"] = 0 api_params = config("api-parameters") From 7fbc304ae9915b7c1c87ee85fc5e22194832affb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 26 Jul 2023 17:53:51 +0200 Subject: [PATCH 244/252] [twitter] fix crash on private user (#4349) --- gallery_dl/extractor/twitter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 7e42079974..a2ca9c1812 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -340,7 +340,11 @@ def _transform_tweet(self, tweet): return tdata def _transform_user(self, user): - uid = user.get("rest_id") or user["id_str"] + try: + uid = user.get("rest_id") or user["id_str"] + except KeyError: + # private/invalid user (#4349) + return {} try: return self._user_cache[uid] From e8299b459af925b89d7058914736221ccf7f77af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 26 Jul 2023 18:02:26 +0200 Subject: [PATCH 245/252] [moebooru] match search URLs with empty 'tags' (#4354) --- gallery_dl/extractor/moebooru.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index 0ef0a32881..1e56bde9bd 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2022 Mike Fährmann +# Copyright 2020-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -166,7 +166,7 @@ class
MoebooruTagExtractor(MoebooruExtractor): ("https://konachan.com/post?tags=patata", { "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d", }), + # empty 'tags' (#4354) + ("https://konachan.com/post?tags="), ("https://konachan.net/post?tags=patata"), ("https://www.sakugabooru.com/post?tags=nichijou"), ("https://lolibooru.moe/post?tags=ruu_%28tksymkw%29"), From 62fce6a75ff6e28a7ccab9169e891d79a0c364ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 28 Jul 2023 14:18:47 +0200 Subject: [PATCH 246/252] [imagehosts] adjust variable names (#4358) prefix them with underscores to prevent a clash with the new 'self.cookies' from d97b8c2f --- gallery_dl/extractor/imagehosts.py | 48 +++++++++++++++--------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index a6e848ca9a..8a1bb3bd28 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,23 +19,23 @@ class ImagehostImageExtractor(Extractor): basecategory = "imagehost" subcategory = "image" archive_fmt = "{token}" - https = True - params = None - cookies = None - encoding = None + _https = True + _params = None + _cookies = None + _encoding = None def __init__(self, match): Extractor.__init__(self, match) self.page_url = "http{}://{}".format( - "s" if self.https else "", match.group(1)) + "s" if self._https else "", match.group(1)) self.token = match.group(2) - if self.params == "simple": - self.params = { + if self._params == "simple": + self._params = { "imgContinue": "Continue+to+image+...+", } - elif self.params == "complex": - self.params = { + elif self._params == "complex": + self._params = { "op": "view", "id": self.token, "pre": "1", @@ -46,16 +46,16 @@ def __init__(self, match): def items(self): page = self.request( self.page_url, - method=("POST" if self.params else "GET"), - data=self.params, - cookies=self.cookies, - encoding=self.encoding, + method=("POST" if self._params else "GET"), + data=self._params, + cookies=self._cookies, + encoding=self._encoding, ).text url, filename = self.get_info(page) data = text.nameext_from_url(filename, {"token": self.token}) data.update(self.metadata(page)) - if self.https and url.startswith("http:"): + if self._https and url.startswith("http:"): url = "https:" + url[5:] yield Message.Directory, data @@ -102,8 +102,8 @@ class ImxtoImageExtractor(ImagehostImageExtractor): "exception": exception.NotFoundError, }), ) - params = "simple" - encoding = "utf-8" + _params = "simple" + _encoding = "utf-8" def __init__(self, match): ImagehostImageExtractor.__init__(self, match) @@ -167,8 +167,8 @@ class AcidimgImageExtractor(ImagehostImageExtractor): "keyword": "135347ab4345002fc013863c0d9419ba32d98f78", "content": "0c8768055e4e20e7c7259608b67799171b691140", }) - params = "simple" - encoding = "utf-8" + _params = "simple" + _encoding = "utf-8" def get_info(self, page): url, pos = text.extract(page, ' Date: Fri, 28 Jul 2023 14:23:11 +0200 Subject: [PATCH 247/252] [acidimg] fix extraction swap ' and " again (2e309a13) and add a fallback in case this happens yet another time --- gallery_dl/extractor/imagehosts.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git 
a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 8a1bb3bd28..8ef51b0abf 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -171,10 +171,16 @@ class AcidimgImageExtractor(ImagehostImageExtractor): _encoding = "utf-8" def get_info(self, page): - url, pos = text.extract(page, '', ' Date: Fri, 28 Jul 2023 14:25:37 +0200 Subject: [PATCH 248/252] [instagram] fix initialization order (#4359) regression caused by the changes in a383eca7 --- gallery_dl/extractor/instagram.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index cb77fa1c7f..561941cec3 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -36,6 +36,13 @@ def __init__(self, match): self.item = match.group(1) def _init(self): + self.www_claim = "0" + self.csrf_token = util.generate_token() + self._find_tags = re.compile(r"#\w+").findall + self._logged_in = True + self._cursor = None + self._user = None + self.cookies.set( "csrftoken", self.csrf_token, domain=self.cookies_domain) @@ -44,13 +51,6 @@ def _init(self): else: self.api = InstagramRestAPI(self) - self.www_claim = "0" - self.csrf_token = util.generate_token() - self._find_tags = re.compile(r"#\w+").findall - self._logged_in = True - self._cursor = None - self._user = None - def items(self): self.login() From 255d08b79ed7aa18304d1a78badb97ff94cbae17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 28 Jul 2023 16:58:16 +0200 Subject: [PATCH 249/252] add test for 'Extractor.initialize()' (#4359) --- gallery_dl/extractor/mangahere.py | 3 +-- gallery_dl/extractor/tumblr.py | 7 +++++-- test/test_extractor.py | 10 +++++++++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py index 97c26d473a..745231b1dd 100644 --- a/gallery_dl/extractor/mangahere.py +++ b/gallery_dl/extractor/mangahere.py @@ -114,8 +114,7 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor): ("https://m.mangahere.co/manga/aria/"), ) - def __init__(self, match): - MangaExtractor.__init__(self, match) + def _init(self): self.cookies.set("isAdult", "1", domain="www.mangahere.cc") def chapters(self, page): diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 12ea39f8c5..9adc3ab137 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -442,10 +442,13 @@ class TumblrDayExtractor(TumblrExtractor): def __init__(self, match): TumblrExtractor.__init__(self, match) year, month, day = match.group(4).split("/") - self.date_min = ts = ( + self.date_min = ( # 719163 == date(1970, 1, 1).toordinal() date(int(year), int(month), int(day)).toordinal() - 719163) * 86400 - self.api.before = ts + 86400 + + def _init(self): + TumblrExtractor._init(self) + self.api.before = self.date_min + 86400 def posts(self): return self.api.posts(self.blog, {}) diff --git a/test/test_extractor.py b/test/test_extractor.py index 6516fa8f62..e328664708 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -132,8 +132,16 @@ def test_unique_pattern_matches(self): else: self.assertIs(extr1, matches[0][1], url) + def test_init(self): + """Test for exceptions in Extractor.initialize()""" + for cls in extractor.extractors(): + for test in cls._get_tests(): + extr = cls.from_url(test[0]) + extr.initialize() + break + def test_docstrings(self): - """ensure docstring
uniqueness""" + """Ensure docstring uniqueness""" for extr1 in extractor.extractors(): for extr2 in extractor.extractors(): if extr1 != extr2 and extr1.__doc__ and extr2.__doc__: From ed21908fda1cb30ecef9823cff6027109152e836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 28 Jul 2023 17:07:25 +0200 Subject: [PATCH 250/252] initial support for child extractor options Using "parent-category>child-category" as extractor category in a config file allows setting options for a child extractor when it was spawned by that parent. For example, "reddit>gfycat" sets gfycat options for when a gfycat link was found in a reddit post. { "extractor": { "gfycat": { "filename": "regular filename" }, "reddit>gfycat": { "filename": "reddit-specific filename" } } } Note: This currently does not work for most imgur links due to how its extractor hierarchy is structured. --- gallery_dl/extractor/common.py | 26 ++++++++++++++------------ gallery_dl/job.py | 15 +++++++++++++++ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index fc6b197cdc..6b94112c49 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -45,10 +45,6 @@ class Extractor(): def __init__(self, match): self.log = logging.getLogger(self.category) self.url = match.string - - if self.basecategory: - self.config = self._config_shared - self.config_accumulate = self._config_shared_accumulate self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" @@ -98,16 +94,22 @@ def config_accumulate(self, key): return config.accumulate(self._cfgpath, key) def _config_shared(self, key, default=None): - return config.interpolate_common(("extractor",), ( - (self.category, self.subcategory), - (self.basecategory, self.subcategory), - ), key, default) + return config.interpolate_common( + ("extractor",), self._cfgpath, key, default) def _config_shared_accumulate(self, key): - values = config.accumulate(self._cfgpath, key) - conf = config.get(("extractor",), self.basecategory) - if conf: - values[:0] = config.accumulate((self.subcategory,), key, conf=conf) + first = True + extr = ("extractor",) + + for path in self._cfgpath: + if first: + first = False + values = config.accumulate(extr + path, key) + else: + conf = config.get(extr, path[0]) + if conf: + values[:0] = config.accumulate( + (self.subcategory,), key, conf=conf) return values def request(self, url, method="GET", session=None, diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 7ecdc39134..2ea8feb721 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -32,6 +32,21 @@ def __init__(self, extr, parent=None): self.kwdict = {} self.status = 0 + cfgpath = [] + if parent and parent.extractor.category != extr.category: + cat = "{}>{}".format( + parent.extractor.category, extr.category) + cfgpath.append((cat, extr.subcategory)) + cfgpath.append((extr.category, extr.subcategory)) + if extr.basecategory: + if not cfgpath: + cfgpath.append((extr.category, extr.subcategory)) + cfgpath.append((extr.basecategory, extr.subcategory)) + if cfgpath: + extr._cfgpath = cfgpath + extr.config = extr._config_shared + extr.config_accumulate = extr._config_shared_accumulate + actions = extr.config("actions") if actions: from .actions import parse From 48ef062867da6500cdf08ab6dbc8da7b6024c879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 29 Jul 2023 13:43:27 +0200 Subject: [PATCH 251/252] fix issues with 'Extractor.finalize()' - prevent crash in
InstagramUserExtractor (#4359) - call it at the end of every DownloadJob - add it to tests --- gallery_dl/extractor/common.py | 4 +++- gallery_dl/extractor/instagram.py | 3 +++ gallery_dl/job.py | 3 +-- test/test_extractor.py | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 6b94112c49..d960a39609 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -36,7 +36,6 @@ class Extractor(): browser = None root = "" test = None - finalize = None request_interval = 0.0 request_interval_min = 0.0 request_timestamp = 0.0 @@ -66,6 +65,9 @@ def initialize(self): self._init() self.initialize = util.noop + def finalize(self): + pass + def items(self): yield Message.Version, 1 diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 561941cec3..5068ebad95 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -405,6 +405,9 @@ class InstagramUserExtractor(InstagramExtractor): def initialize(self): pass + def finalize(self): + pass + def items(self): base = "{}/{}/".format(self.root, self.item) stories = "{}/stories/{}/".format(self.root, self.item) diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 2ea8feb721..f169788e69 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -140,8 +140,7 @@ def run(self): log.info("No results for %s", extractor.url) finally: self.handle_finalize() - if extractor.finalize: - extractor.finalize() + extractor.finalize() return self.status diff --git a/test/test_extractor.py b/test/test_extractor.py index e328664708..67d55b065b 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -138,6 +138,7 @@ def test_init(self): for test in cls._get_tests(): extr = cls.from_url(test[0]) extr.initialize() + extr.finalize() break From d50c312ff0352e00cff3ea9ba384de53449de2bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 29 Jul 2023 13:48:31 +0200 Subject: [PATCH 252/252] prevent test failure when there's no 'ytdl' module (#4364) split ytdl off into its own test function and skip it when there's an ImportError, similar to test_ytdl.py --- test/test_extractor.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/test_extractor.py b/test/test_extractor.py index 67d55b065b..f8bed13388 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -135,12 +135,25 @@ def test_unique_pattern_matches(self): def test_init(self): """Test for exceptions in Extractor.initialize()""" for cls in extractor.extractors(): + if cls.category == "ytdl": + continue for test in cls._get_tests(): extr = cls.from_url(test[0]) extr.initialize() extr.finalize() break + def test_init_ytdl(self): + try: + extr = extractor.find("ytdl:") + extr.initialize() + extr.finalize() + except ImportError as exc: + if exc.name in ("youtube_dl", "yt_dlp"): + raise unittest.SkipTest("cannot import module '{}'".format( + exc.name)) + raise + def test_docstrings(self): """Ensure docstring uniqueness""" for extr1 in extractor.extractors(): for extr2 in extractor.extractors(): if extr1 != extr2 and extr1.__doc__ and extr2.__doc__: