Change: switch to curl_cffi, fix oh

eight04 committed Nov 20, 2024
1 parent 675f456 commit 4418326
Showing 12 changed files with 101 additions and 84 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@ dist
 
 .venv
 test*.*
+temp
2 changes: 1 addition & 1 deletion comiccrawler/analyzer.py
@@ -138,7 +138,7 @@ def analyze_pages(self):
 		print('Analyzing {}...'.format(url))
 		sleep(getattr(self.mission.module, "rest_analyze", 0))
 		r = urlparse(self.mission.url)
-		self.html = self.grabber.html(url, retry=True, header={
+		self.html = self.grabber.html(url, retry=True, headers={
 			"Referer": self.mission.url,
 			"Origin": f"{r.scheme}://{r.netloc}"
 		})
2 changes: 1 addition & 1 deletion comiccrawler/crawler.py
@@ -220,7 +220,7 @@ def get_html(self):
 			self.html = True
 		else:
 			r = urlparse(self.mission.url)
-			self.html = self.downloader.html(self.ep.current_url, header={
+			self.html = self.downloader.html(self.ep.current_url, headers={
 				"Referer": self.mission.url,
 				"Origin": f"{r.scheme}://{r.netloc}"
 			})
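Both call sites above switch from the old `header=` keyword to `headers=`, the requests-style name that curl_cffi also uses. A minimal sketch of the new call shape, assuming a bare curl_cffi session (the URL and header values are hypothetical):

```python
# Sketch of the renamed keyword against a plain curl_cffi session;
# the real grabber.html wrapper adds retry/analyze logic on top.
from curl_cffi import requests

s = requests.Session()
r = s.get(
    "https://example.com/comic/ep1",
    headers={
        "Referer": "https://example.com/comic",
        "Origin": "https://example.com",
    },
)
print(r.status_code)
```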
3 changes: 2 additions & 1 deletion comiccrawler/error.py
@@ -1,4 +1,5 @@
-from requests import HTTPError
+# from requests import HTTPError
+from curl_cffi.requests.exceptions import HTTPError
 
 class ComicCrawlerSignal(BaseException):
 	"""Extend BaseException."""
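With the import swapped, code that catches `HTTPError` via `comiccrawler.error` keeps working against curl_cffi responses. A minimal sketch under that assumption (the URL is hypothetical):

```python
# Sketch: downstream "except HTTPError" still works because error.py
# now re-exports curl_cffi's HTTPError under the same name.
from curl_cffi import requests
from curl_cffi.requests.exceptions import HTTPError

try:
    r = requests.get("https://example.com/missing")
    r.raise_for_status()
except HTTPError as err:
    print("HTTP error:", err.response.status_code)
```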
58 changes: 14 additions & 44 deletions comiccrawler/grabber.py
@@ -5,15 +5,15 @@
 from threading import Lock
 from urllib.parse import quote, urlsplit, urlunsplit, urlparse
 import re
-import socket
+# import socket
 import time
 import json
 
 import enlighten
-import requests
 from worker import WorkerExit, async_, await_, sleep, Defer
-from urllib3.util import is_fp_closed
+# from urllib3.util import is_fp_closed
 from urllib3.exceptions import IncompleteRead
+from curl_cffi.requests.exceptions import HTTPError
 
 from .config import setting
 from .io import content_write
@@ -74,24 +74,19 @@ def grabber_log(obj):
 	content = time.strftime("%Y-%m-%dT%H:%M:%S%z") + "\n" + json.dumps(obj, indent=2, sort_keys=True) + "\n\n"
 	content_write(profile("grabber.log"), content, append=True)
 
-def grabber(url, header=None, *, referer=None, cookie=None,
-		retry=False, done=None, proxy=None, **kwargs):
+def grabber(url, *, referer=None, retry=False, done=None, proxy=None, **kwargs):
 	"""Request url, return text or bytes of the content."""
 	s = session_manager.get(url)
 
 	if referer:
 		s.headers['Referer'] = quote_unicode(referer)
 
-	if cookie:
-		quote_unicode_dict(cookie)
-		requests.utils.add_dict_to_cookiejar(s.cookies, cookie)
-
 	if isinstance(proxy, str):
 		proxies = {'http': proxy, 'https': proxy}
 	else:
 		proxies = proxy
 
-	r = await_(do_request, s, url, proxies, retry, headers=header, **kwargs)
+	r = await_(do_request, s, url, proxies, retry, **kwargs)
 
 	if done:
 		done(s, r)
@@ -116,27 +111,13 @@ def do_request(s, url, proxies, retry, **kwargs):
 		})
 
 		if r.status_code in SUCCESS_CODES:
-			content_length = r.headers.get("Content-Length")
-			if not kwargs.get("stream", False) and content_length and int(content_length) != r.raw.tell():
-				raise ValueError(
-					"incomplete response. Content-Length: {content_length}, got: {actual}"
-					.format(content_length=content_length, actual=r.raw.tell())
-				)
 			break
 		if not retry or r.status_code not in RETRYABLE_HTTP_CODES:
 			r.raise_for_status()
 		# 302 error without location header
 		if r.status_code == 302:
-			# pylint: disable=protected-access
-			match = re.search(
-				r"^location:\s*(.+)",
-				str(r.raw._original_response.msg),
-				re.M + re.I
-			)
-			if not match:
-				raise TypeError("status 302 without location header")
-			url = match.group(1)
-			continue
+			raise TypeError("status 302 without location header")
 		print(r)
 		print("retry after {sleep_time} seconds".format(sleep_time=sleep_time))
 		sleep(sleep_time)
@@ -160,19 +141,9 @@ def guess_encoding(r):
 
 def iter_content(r):
 	"""Iterate the content of the response."""
-	# FIXME: requests streaming is so broken wtf
-	# https://github.com/psf/requests/issues/5536
-	# https://github.com/urllib3/urllib3/issues/2123
-	if r.raw.chunked and r.raw.supports_chunked_reads():
-		yield from r.raw.read_chunked(decode_content=True)
-	else:
-		while not is_fp_closed(r.raw._fp) or len(r.raw._decoded_buffer) > 0: # pylint: disable=protected-access
-			b = r.raw.read1(decode_content=True)
-			yield b
-			if not b:
-				sleep(0.1)
+	yield from r.iter_content()
 
-def grabimg(*args, on_opened=None, tempfile=None, header=None, **kwargs):
+def grabimg(*args, on_opened=None, tempfile=None, headers=None, **kwargs):
 	"""Grab the image. Return ImgResult"""
 	kwargs["stream"] = True
 	loaded = 0
@@ -182,12 +153,12 @@ def grabimg(*args, on_opened=None, tempfile=None, header=None, **kwargs):
 	except FileNotFoundError:
 		pass
 	if loaded:
-		if not header:
-			header = {}
-		header["Range"] = f"bytes={loaded}-"
+		if not headers:
+			headers = {}
+		headers["Range"] = f"bytes={loaded}-"
 	try:
-		r = grabber(*args, header=header, **kwargs)
-	except requests.HTTPError as err:
+		r = grabber(*args, headers=headers, **kwargs)
+	except HTTPError as err:
 		if err.response.status_code != 416:
 			raise err
 	try:
@@ -227,8 +198,7 @@ def _():
 				counter.update(len(chunk))
 				loaded += len(chunk)
 		except WorkerExit:
-			socket.close(r.raw._fp.fileno()) # pylint: disable=protected-access
-			r.raw.release_conn()
+			r.close()
 			raise
 		if total and loaded < total:
 			raise IncompleteRead(loaded, total - loaded)
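The grabber changes lean on curl_cffi's own streaming (`r.iter_content()`) and connection teardown (`r.close()`), dropping the urllib3 workarounds. The resume logic in `grabimg` is the part worth seeing end to end; a simplified sketch of that flow (the URL and path are hypothetical, and HTTP 416 means the file is already complete):

```python
# Sketch of grabimg's resume flow under curl_cffi: stream the body and
# retry from the bytes already on disk via a Range header.
import os
from curl_cffi import requests
from curl_cffi.requests.exceptions import HTTPError

def resume_download(url, path):
    loaded = os.path.getsize(path) if os.path.exists(path) else 0
    headers = {"Range": f"bytes={loaded}-"} if loaded else None
    try:
        r = requests.get(url, headers=headers, stream=True)
        r.raise_for_status()
    except HTTPError as err:
        if err.response.status_code == 416:
            return  # nothing left to fetch
        raise
    with open(path, "ab") as f:
        for chunk in r.iter_content():
            f.write(chunk)
```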
70 changes: 56 additions & 14 deletions comiccrawler/mods/oh.py
@@ -13,7 +13,7 @@
 
 from ..core import Episode, grabhtml
 
-domain = ["www.ohmanhua.com", "www.cocomanhua.com"]
+domain = ["www.ohmanhua.com", "www.cocomanhua.com", "www.colamanga.com"]
 name = "OH漫畫"
 
 def get_title(html, url):
@@ -48,7 +48,7 @@ def __str__(self):
 scripts = ScriptCache()
 
 def get_images(html, url):
-	cdata = re.search("var C_DATA='[^']+'", html).group(0)
+	cdata = re.search("var C_DATA=('[^']+')", html).group(1)
 
 	scripts.fetch(html, url, [
 		"\/l\.js",
@@ -58,31 +58,71 @@ def get_images(html, url):
 	])
 
 	code = """
+	const _log = console.log;
+	Function.prototype.toString = (function(_toString) {
+		return function() {
+			return _toString.apply(this, arguments).replace(/\\r?\\n/g, '');
+		}
+	})(Function.prototype.toString);
+	self.setInterval = function() {};
+	self.eval = function(_eval) {
+		return function() {
+			_log('eval', arguments[0]);
+			return _eval.apply(this, arguments);
+		};
+	}(self.eval);
+	self.convertWordArrayToUint8Array =
+		self.convertUint8ArrayToWordArray =
+		self.__b_a =
+		self.__cad =
+		self.__js =
+		undefined;
 	(function() {
+		let _cookies = "";
 		function noop(path = "") {
-			if (path === "document.cookie") return "";
+			if (path === "document.cookie") return _cookies;
 			if (path === "$.inArray") return (v, a) => a.indexOf(v);
 			return new Proxy(() => {}, {
-				apply: () => noop("?"),
-				get: (target, prop) => noop(`${path}.${prop}`)
+				apply: () => noop(`${path}.called`),
+				get: (target, prop) => {
+					const propPath = typeof prop == "symbol" ? `${path}.${String(prop)}` : `${path}.${prop}`;
+					if (propPath == "document.domain") return "www.colamanga.com";
+					_log("get", propPath);
+					return noop(propPath);
+				},
+				set: (target, prop, value) => {
+					const propPath = `${path}.${prop}`;
+					if (propPath == "document.cookie") {
+						_cookies += value.split(";")[0] + "; ";
+					}
+					_log(propPath, value);
+					return value;
+				}
 			});
 		}
 		const exports = undefined;
-		const window = global;
-		window.location = {
+		self.window = self;
+		self.location = {
 			protocol: "http://",
 			href: '""" + url + """'
 		}
-		const navigator = {
+		self.navigator = {
 			userAgent: ""
 		};
-		const document = noop("document")
-		const $ = noop("$");
+		self.document = noop("document")
+		self.$ = noop("$");
+		self.devtools = noop("devtools");
+		self.localStorage = noop("localStorage");
-	""" + cdata + "\n" + str(scripts) + """
+	self.C_DATA = """ + cdata + "\n" + str(scripts) + """
 	window.use_domain = {
 	},
@@ -108,8 +148,10 @@ class Image {
 			__cr.preLoadImg(i++)
 		} while (dirty);
 		return imgs;
-	}).call(global);
+	}).call(self);
 	"""
 
+	# import pathlib
+	# pathlib.Path("oh0.mjs").write_text(code, encoding="utf-8")
 	imgs = eval(code)
 	return [urljoin(url, i) for i in imgs]
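The `C_DATA` regex now captures the quoted string literal with `group(1)` instead of taking the whole `var` statement with `group(0)`, so it can be embedded as the right-hand side of `self.C_DATA = ...` in the generated script. A small sketch with made-up page content:

```python
# Sketch of the regex change: group(1) keeps the quotes, producing a
# value that pastes directly into the sandboxed JS as a string literal.
import re

html = "<script>var C_DATA='abc123';</script>"  # hypothetical page body
cdata = re.search(r"var C_DATA=('[^']+')", html).group(1)
print(cdata)  # 'abc123' (quotes included)
```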
4 changes: 2 additions & 2 deletions comiccrawler/mods/setnmh.py
@@ -31,7 +31,7 @@ def get_episodes(html, url):
 			"order_by": "1",
 			"chapter_type": "1"
 		},
-		header = {
+		headers = {
 			"X-Requested-With": "XMLHttpRequest"
 		}
 	)
@@ -74,7 +74,7 @@ def get_images(html, url):
 			"chapter_id": chapter_id,
 			"page": page
 		},
-		header = {
+		headers = {
 			"X-Requested-With": "XMLHttpRequest"
 		}
 	)
27 changes: 14 additions & 13 deletions comiccrawler/module_grabber.py
@@ -1,4 +1,4 @@
-from requests.utils import dict_from_cookiejar
+# from requests.utils import dict_from_cookiejar
 
 from .grabber import grabhtml, grabimg
 
@@ -19,8 +19,8 @@ def img(self, url, **kwargs):
 
 	def grab(self, grab_method, url=None, **kwargs):
 		new_kwargs = {
-			"header": self.get_header(),
-			"cookie": purify_cookie(self.get_cookie()),
+			"headers": self.get_header(),
+			"cookies": purify_cookie(self.get_cookie()),
 			"done": self.handle_grab,
 			"proxy": self.mod.config.get("proxy"),
 			"verify": self.mod.config.getboolean("verify", True)
@@ -50,13 +50,14 @@ def get_cookie(self):
 		return cookie
 
 	def handle_grab(self, session, _response):
-		cookie = dict_from_cookiejar(session.cookies)
-		config = getattr(self.mod, "config", None)
-		if not config:
-			return
-
-		for key in config:
-			if key.startswith("cookie_"):
-				name = key[7:]
-				if name in cookie:
-					config[key] = cookie[name]
+		pass
+		# cookie = dict_from_cookiejar(session.cookies)
+		# config = getattr(self.mod, "config", None)
+		# if not config:
+		# 	return
+		#
+		# for key in config:
+		# 	if key.startswith("cookie_"):
+		# 		name = key[7:]
+		# 		if name in cookie:
+		# 			config[key] = cookie[name]
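Module cookies are now passed per request as a plain dict via `cookies=` instead of being merged into a requests cookiejar, which is why `handle_grab`'s cookie write-back above is stubbed out. A minimal sketch of the new style (names and values are hypothetical):

```python
# Sketch: headers and cookies travel with the request as plain dicts,
# matching the new_kwargs built in grab() above.
from curl_cffi import requests

s = requests.Session()
r = s.get(
    "https://example.com/page",
    headers={"Referer": "https://example.com"},
    cookies={"PHPSESSID": "abc123"},
)
```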
2 changes: 1 addition & 1 deletion comiccrawler/session_manager.py
@@ -2,7 +2,7 @@
 from threading import Lock
 from typing import Callable, Any
 
-from requests import Session as RequestsSession
+from curl_cffi.requests import Session as RequestsSession
 
 from .util import extract_curl
 
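curl_cffi's `Session` is close enough to `requests.Session` that aliasing it to the old name leaves the rest of `session_manager` untouched. A minimal sketch; the `impersonate` argument is an assumption (this diff does not set one), but it is the usual reason to pick curl_cffi, since it mimics a real browser's TLS fingerprint:

```python
# Sketch: drop-in session under the old alias. impersonate="chrome" is
# an assumption for illustration, not something this commit configures.
from curl_cffi.requests import Session as RequestsSession

s = RequestsSession(impersonate="chrome")
r = s.get("https://example.com")  # hypothetical URL
print(r.status_code)
```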
7 changes: 5 additions & 2 deletions comiccrawler/util.py
@@ -2,9 +2,10 @@
 import string
 from functools import total_ordering
 from pathlib import Path
-from requests.cookies import RequestsCookieJar
+from http.cookiejar import CookieJar
 
 import uncurl
+import curl_cffi.requests.cookies
 
 def dump(html):
 	Path("dump.html").write_text(html, encoding="utf-8")
@@ -100,7 +101,9 @@ def balance(s: str, index: int, left="(", right=")", skip=0):
 
 	return s[start:end]
 
-def get_cookie(cookie_jar: RequestsCookieJar, name, domain=None) -> str:
+def get_cookie(cookie_jar: CookieJar | curl_cffi.requests.cookies.Cookies, name, domain=None) -> str:
+	if hasattr(cookie_jar, "jar"):
+		cookie_jar = cookie_jar.jar
 	l = [cookie for cookie in cookie_jar if cookie.name == name]
 	def key(cookie):
 		if not domain or not cookie.domain:
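`get_cookie` now accepts either a stdlib `CookieJar` or curl_cffi's `Cookies` wrapper, which keeps the underlying jar in a `.jar` attribute. A minimal sketch of that unwrapping, assuming both objects start empty:

```python
# Sketch of why get_cookie unwraps .jar: curl_cffi's Cookies wraps a
# stdlib CookieJar, while a bare CookieJar iterates directly.
from http.cookiejar import CookieJar
import curl_cffi.requests.cookies

def cookie_names(cookie_jar):
    if hasattr(cookie_jar, "jar"):  # curl_cffi Cookies wrapper
        cookie_jar = cookie_jar.jar
    return [cookie.name for cookie in cookie_jar]

print(cookie_names(CookieJar()))                           # []
print(cookie_names(curl_cffi.requests.cookies.Cookies()))  # []
```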