Skip to content

Commit

Permalink
Fix: eight
Browse files Browse the repository at this point in the history
  • Loading branch information
eight04 committed Nov 13, 2024
1 parent 898fa76 commit 0009597
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 71 deletions.
171 changes: 102 additions & 69 deletions comiccrawler/mods/eight.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,25 @@
import re
from urllib.parse import urljoin

from deno_vm import VM
from deno_vm import VM, eval

from ..core import Episode, grabhtml
from ..util import clean_tags
from ..url import update_qs

domain = ["www.8comic.com", "www.comicvip.com", "comicbus.com", "www.comicabc.com"]
name = "無限"
next_page_cache = {}
nview = None

def get_title(html, url):
	"""Return the comic title parsed from the page's addhistory("<id>","<title>" call.

	html -- page HTML containing an addhistory("...","...") invocation.
	url  -- page URL (unused; kept for the module's uniform mod interface).
	Raises AttributeError if the pattern is not found (re.search returns None).
	"""
	# Raw string so \d and \( are regex escapes, not Python string escapes
	# (the non-raw duplicate line from the old revision is removed — it was
	# unreachable dead code after the first return).
	return re.search(r'addhistory\("\d+","([^"]+)', html).group(1)

def get_episodes(html, url):
html = html.replace("\n", "")

comicview_js = grabhtml(urljoin(url, "/js/comicview.js"))
js = """
function cview(...args) {
var output;
function getCookie() {}
function getcookie() {}
Expand All @@ -35,94 +36,126 @@ def get_episodes(html, url):
output = result;
}
};
var document = {
location: {
href: ""
}
};
""" + grabhtml(urljoin(url, "/js/comicview.js"))

const location = {set href(url) {output = url;}};
const document = {location};
const $ = () => $;
$.attr = $.html = $.text = $;
const addch = () => {};
""" + comicview_js + """
cview(...args);
return output;
}
"""

s = []
matches = re.finditer(
r'<a [^>]*?onclick="(cview[^"]+?);[^>]*>(.+?)</a>',
html, re.M
)
with VM(js) as vm:
with VM() as vm:
vm.run(js)
for match in matches:
cview, title = match.groups()

vm.run(cview)
ep_url = vm.run("output")
if "this" in cview:
continue

ep_url = vm.run(cview)
# ep_url = vm.run("location.href")
print("ep_url", ep_url)
title = clean_tags(title)

e = Episode(title, urljoin(url, ep_url))
s.append(e)
return s

j_js = ""
lazy_js = ""

def get_images(html, url):
global nview
if not nview:
nview = re.search('src="([^"]*nview\.js[^"]*)"', html).group(1)
nview = urljoin(url, nview)
nview = grabhtml(nview)
global j_js
if not j_js:
j_js = re.search(r'src="([^"]*/j\.js[^"]*)"', html).group(1)
j_js = urljoin(url, j_js)
j_js = grabhtml(j_js)

try:
# http://www.comicbus.com/html/103.html
script = re.search('(var ch=.+?)spp\(\)', html, re.DOTALL).group(1)
script = re.search(r'(var ch=.+?)spp\(\)', html, re.DOTALL).group(1)
except AttributeError:
# http://www.comicbus.com/html/7294.html
script = re.search('(var chs=.+?)</script>', html, re.DOTALL).group(1)

global lazy_js
if not lazy_js:
lazy_js = re.search(r'src="([^"]*/lazyloadx\.js[^"]*)"', html).group(1)
lazy_js = urljoin(url, lazy_js)
lazy_js = grabhtml(lazy_js)
lazy_js = re.search(r'(var a=[\s\S]*?)o\.setAttribute', lazy_js).group(1)

js = """
var url,
images = [],
document = {
location: {
toString() {return url;},
get href() {return url;},
set href(_url) {url = _url;}
},
getElementById() {
return {
set src(value) {
images.push(value);
},
style: {}
};
}
},
navigator = {
userAgent: "",
language: ""
},
window = {},
alert = () => {};
(() => {
var url = """ + f"{url!r}" + """,
images = [],
document = {
documentElement: {},
location: {
toString() {
return url;
},
get href() {
return url;
},
set href(_url) {
url = _url;
},
},
getElementById() {
return {
set src(value) {
images.push(value);
},
style: {},
};
},
},
navigator = {
userAgent: "",
language: "",
},
window = { location: document.location,
document},
alert = () => {},
localStorage = {
getItem() {
return null;
},
setItem() {},
},
$ = () => $,
ps,
ci,
pi,
ni,
vv = "",
src;
$.attr = $.ready = $.on = $.click = $.hide = $.show = $.css = $.html = $.append = $.get = $.ajax = $.post = $;
""" + j_js + script + """
function scriptBody() {
initpage = () => {};
""" + nview + script + """
return [images[0], p, ps, ch];
}
function getImages(url) {
images = [];
document.location.href = url;
return scriptBody();
}
"""

with VM(js) as vm:
img, p, ps, ch = vm.call("getImages", url)
if p < ps:
if "/ReadComic/" in url:
# https://www.comicabc.com/ReadComic/6997/734/734_8d00xI27S.html?p=2
next_page_cache[url] = update_qs(url, {"p": p + 1})
else:
# https://www.comicabc.com/online/new-18117.html?ch=122-2
next_page_cache[url] = update_qs(url, {"ch": f"{ch}-{p + 1}"})

function *parseSrc() {
const rx = / s="([^"]+)"/g;
while ((m = rx.exec(xx))) {
yield m[1];
}
}
return urljoin(url, img)
return [...parseSrc()].map(src => {
""" + lazy_js + """
return unescape(src)
});
})();
"""
imgs = eval(js)
return imgs

def get_next_page(html, url):
	"""Return the next-page URL cached for *url* by get_images, or None.

	The entry is removed from the module-level cache as it is consumed,
	so each cached next-page URL is reported exactly once.
	"""
	try:
		return next_page_cache.pop(url)
	except KeyError:
		# No next page was recorded for this URL.
		return None
2 changes: 1 addition & 1 deletion comiccrawler/mods/facebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def get_title(html, url):
except AttributeError:
id = re.search("set=([^&]+)", url).group(1)
title = re.search("<title[^>]*>([^<]+)", html).group(1)
title = re.sub("\s+", " ", title)
title = re.sub(r"\s+", " ", title)
return unescape("{} ({})".format(title, id))

def get_episodes(html, url):
Expand Down
2 changes: 1 addition & 1 deletion comiccrawler/mods/sankaku_beta.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def get_episodes(html, url):
return eps[::-1]

def get_images(html, url):
	"""Return the full-size file URL for a sankaku post.

	html -- page HTML (unused; the post id comes from the URL).
	url  -- post URL of the form .../post/show/<id>...
	Raises AttributeError if the URL does not match, IndexError/KeyError
	if the API returns no matching post.
	"""
	# Raw string for the regex; the old non-raw duplicate line was dead
	# code (immediately overwritten) and is removed. Renamed the local so
	# it no longer shadows the builtin `id`.
	post_id = re.search(r"post/show/(\d+)", url).group(1)
	# capi-v2 JSON API: look the post up by id and take its file_url.
	data = grabhtml(
		"https://capi-v2.sankakucomplex.com/posts"
		"?lang=english&page=1&limit=1&tags=id_range:{}".format(post_id)
	)
	data = json.loads(data)
	return data[0]["file_url"]
Expand Down

0 comments on commit 0009597

Please sign in to comment.