diff --git a/README.assets/image-20200414232616854.png b/README.assets/image-20200414232616854.png new file mode 100644 index 0000000..54b30f1 Binary files /dev/null and b/README.assets/image-20200414232616854.png differ diff --git a/README.assets/image-20200414232733377.png b/README.assets/image-20200414232733377.png new file mode 100644 index 0000000..d0c6689 Binary files /dev/null and b/README.assets/image-20200414232733377.png differ diff --git a/README.assets/image-20200414232929882.png b/README.assets/image-20200414232929882.png new file mode 100644 index 0000000..1ab6818 Binary files /dev/null and b/README.assets/image-20200414232929882.png differ diff --git a/README.assets/image-20200414233052905.png b/README.assets/image-20200414233052905.png new file mode 100644 index 0000000..0f83ed9 Binary files /dev/null and b/README.assets/image-20200414233052905.png differ diff --git a/README.md b/README.md index 9f14b2b..433f507 100644 --- a/README.md +++ b/README.md @@ -1 +1,27 @@ -# ngapost2md \ No newline at end of file +# ngapost2md + +快速爬楼存回复人+时间+内容,支持保存正文内的(最高清的)图片(仅附件的图但没有附在正文内的暂时不支持)。 + +支持引用内容的识别(格式可能还有问题),和(部分)ac娘表情的转义(需要到release页面下smile.zip) + +支持增量爬楼 + +## 使用指引 + +1. 把nga.py下下来,修改headers和cookies(cookies是自己账号登录后的具体内容) + +2. 将smile.zip解压,确保smile文件夹(里面就是各种ac娘表情包)和nga.py在同一个目录下 + +3. 双击启动输入tid即可,之后会反显爬楼爬页的情况和下图片的情况 + +4. 最后会在nga.py所在的目录下出一个新的以tid命名的文件夹,里面有post.md直接查看就行。 + +### 图片快速指引 + +![image-20200414232616854](README.assets/image-20200414232616854.png) + +![image-20200414232733377](README.assets/image-20200414232733377.png) + +![image-20200414232929882](README.assets/image-20200414232929882.png) + +![image-20200414233052905](README.assets/image-20200414233052905.png) \ No newline at end of file diff --git a/nga.py b/nga.py new file mode 100644 index 0000000..83f5ee0 --- /dev/null +++ b/nga.py @@ -0,0 +1,137 @@ +# -*- coding: UTF-8 -*- +import re +import requests +import os +import sys +import time +from contextlib import closing +import hashlib + +#=============先修改 +headers = { + 'User-agent': '__'} +cookies = { + 'ngaPassportUid': '__', + 'bbsmisccookies': '__', + 'ngacn0comUserInfo': '__', + 'ngacn0comUserInfoCheck': '__', + 'ngacn0comInfoCheckTime': '__', + 'ngaPassportUrlencodedUname': '__', + 'ngaPassportCid': '__', +} +#=============先修改 +totalfloor = [] +tid = 0 +title = 'title' +localmaxpage = 1 +localmaxfloor = -1 + + +def single(page): + print ('trypage%d' % page) + params = ( + ('tid', tid), + ('_ff', '-7'), + ('page', page) + ) + ss1 = requests.Session() + get = ss1.get('https://bbs.nga.cn/read.php', headers=headers, + params=params, cookies=cookies) + get.encoding = 'GBK' + content = get.text + global title + title = re.search(r'(.+?)', content, flags=re.S).group(1) + userdict = {} + rex = re.findall(r'"uid":(\d+?),"username":"(.+?)",', content, flags=re.S) + for ritem in rex: + userdict[ritem[0]] = ritem[1] + + reply = re.findall(r'func=ucp&uid=(\d+)\' id=\'postauthor.+?\'reply time\'>(.+?).+?<(?:p|span) id=\'postcontent(\d+?)\' class=\'postcontent ubbcode\'>(.+?)', content, flags=re.S) + for i in range(len(reply)): + totalfloor.append([reply[i][2], reply[i][1], userdict[reply[i][0]], reply[i][3]]) + return re.search(r'下一页', content) != None + +def makefile(): + global localmaxfloor + lastfloor = 0 + with open(('.\\\\%d\\post.md' % tid),'a',encoding='utf-8') as f: + for onefloor in totalfloor: + if localmaxfloor','\n')#换行 + raw = raw.replace('
','\n') + + rex = re.findall(r'(?<=\[img\]).+?(?=\[/img\])',raw)#图片 + for ritem in rex: + url = str(ritem) + if url[0:2] == './': + url = 'https://img.nga.178.com/attachments/' + url[2:] + url = url.replace('.medium.jpg','') + filename = hashlib.md5(bytes(url, encoding='utf-8')).hexdigest()[2:8] + url[-6:] + if os.path.exists('.\\\\%d\\%s' % (tid,filename)) == False: + down(url,('.\\\\%d\\%s' % (tid,filename))) + print('down:%s' % ('.\\\\%d\\%s' % (tid,filename))) + raw = raw.replace(('[img]%s[/img]' % ritem),('![img](.\\\\%s)' % filename)) + + rex = re.findall(r'\[s\:(a2|ac)\:(.+?)\]',raw)#表情 + for ritem in rex: + raw = raw.replace('[s:%s:%s]' % (ritem[0], ritem[1]),'![%s](..\\\\smile\\%s.png)' % (ritem[0]+ritem[1],ritem[0]+ritem[1])) + #[0]人名 [1]时间 [2]圈的内容 + rex = re.findall(r'\[quote\].+?\[uid=\d+\](.+?)\[/uid\] \((.+?)\)\:\[/b\](.+?)\[/quote\]',raw, flags=re.S)#引用 [quote][tid=0000000]Topic[/tid] [b]Post by [uid=000000]whowhowho[/uid] (2020-03-26 01:07):[/b] + for ritem in rex: + quotetext = ritem[2] + quotetext = quotetext.replace('\n','\n>') + raw = raw.replace(re.search(r'\[quote\].+?\[uid=\d+\](.+?)\[/uid\] \((.+?)\)\:\[/b\](.+?)\[/quote\]',raw, flags=re.S).group(),'>%s(%s) said:%s' % (ritem[0],ritem[1],quotetext)) + + + f.write(("%s\n\n" % raw)) + lastfloor = int(onefloor[0]) + return lastfloor + +def down(url,path): + with closing(requests.get(url, stream=True)) as response: + chunk_size = 1024 # 单次请求最大值 + with open(path, "wb") as file: + for data in response.iter_content(chunk_size=chunk_size): + file.write(data) + +def main(): + global tid + tid = int(input('tid:')) + holder() + input('press to exit.') +def holder(): + global localmaxpage + global localmaxfloor + print(tid) + if not os.path.exists(('.\\\\%d' % tid)): + os.mkdir(('.\\\\%d' % tid)) + elif os.path.exists('.\\\\%d\\max.txt' % tid): + with open('.\\\\%d\\max.txt' % tid,'r',encoding='utf-8') as f: + r = f.read() + localmaxpage = int(r.split( )[0]) + localmaxfloor = int(r.split( )[1]) + + print('localmaxpage%d\nlocalmaxfloor%d' % (localmaxpage,localmaxfloor)) + cpage = localmaxpage + while single(cpage) != False: + cpage = cpage + 1 + + with open(('.\\\\%d\\max.txt' % tid),'w',encoding='utf-8') as f: + f.write("%d %s" % (cpage, totalfloor[len(totalfloor) - 1][0])) + + if os.path.exists('.\\\\%d\\info.txt' % tid): + with open(('.\\\\%d\\info.txt' % tid),'a',encoding='utf-8') as f: + f.write('[%s]%d\n' % (time.asctime(time.localtime(time.time())), len(totalfloor) - 1)) + else: + with open(('.\\\\%d\\info.txt' % tid),'w',encoding='utf-8') as f: + f.write('tid:%d\n' % tid) + f.write(('[%s]%d\n' % (time.asctime(time.localtime(time.time())), len(totalfloor) - 1))) + + print('makeuntil:%d' % makefile()) + +if __name__ == '__main__': + main() \ No newline at end of file