Skip to content

Commit

Permalink
[deviantart] extract journal HTML from webpage (#6254, #6207, #6196)
Browse files: browse the repository at this point in the history
  • Loading branch information
mikf committed Oct 1, 2024
1 parent fb6be2d commit 7dbd53e
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions gallery_dl/extractor/deviantart.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,15 +369,30 @@ def _extract_journal(self, deviation):
else:
page = self._limited_request(deviation["url"]).text

# extract journal html from webpage
html = text.extr(
page,
"<h2>Literature Text</h2></span><div>",
"</div></section>")
if html:
return {"html": html}

self.log.warning("%s: Failed to extract journal HTML from "
"webpage. Falling back to __INITIAL_STATE__ "
"markup.", deviation["index"])

# parse __INITIAL_STATE__ as fallback
state = util.json_loads(text.extr(
page, 'window.__INITIAL_STATE__ = JSON.parse("', '");')
.replace("\\\\", "\\").replace("\\'", "'").replace('\\"', '"'))

deviation = state["@@entities"]["deviation"].popitem()[1]
content = deviation["textContent"]
deviations = state["@@entities"]["deviation"]
content = deviations.popitem()[1]["textContent"]

html = content["html"]["markup"]
if html.startswith("{"):
self.log.warning("%s: Unsupported '%s' markup.",
deviation["index"], content["html"]["type"])
html = content["excerpt"].replace("\n", "<br />")
return {"html": html}

Expand Down

0 comments on commit 7dbd53e

Please sign in to comment.