Skip to content

Commit

Permalink
Add code
Browse files Browse the repository at this point in the history
  • Loading branch information
ludoux committed Apr 14, 2020
1 parent b74ef81 commit 4f8f601
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 1 deletion.
Binary file added README.assets/image-20200414232616854.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added README.assets/image-20200414232733377.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added README.assets/image-20200414232929882.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added README.assets/image-20200414233052905.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,27 @@
# ngapost2md
# ngapost2md

快速爬楼,保存每层的回复人、时间和内容;支持保存正文内的图片(取最高清的版本)。仅作为附件上传、没有插入正文的图片暂时不支持。

支持识别引用内容(格式可能还有问题),并支持转换(部分)ac娘表情(需要先到 release 页面下载 smile.zip)。

支持增量爬楼

## 使用指引

1. 下载 nga.py,修改其中的 headers 和 cookies(cookies 填写自己账号登录后的实际内容)

2. 将smile.zip解压,确保smile文件夹(里面就是各种ac娘表情包)和nga.py在同一个目录下

3. 双击启动并输入 tid 即可,之后会显示爬楼翻页的进度和下载图片的情况

4. 最后会在nga.py所在的目录下出一个新的以tid命名的文件夹,里面有post.md直接查看就行。

### 图片快速指引

![image-20200414232616854](README.assets/image-20200414232616854.png)

![image-20200414232733377](README.assets/image-20200414232733377.png)

![image-20200414232929882](README.assets/image-20200414232929882.png)

![image-20200414233052905](README.assets/image-20200414233052905.png)
137 changes: 137 additions & 0 deletions nga.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# -*- coding: UTF-8 -*-
import re
import requests
import os
import sys
import time
from contextlib import closing
import hashlib

# ============= Edit these first =============
# Request headers and login cookies sent with every page request in single().
# Replace each '__' placeholder with the value from your own logged-in session.
headers = {
    'User-agent': '__',
}
cookies = {
    'ngaPassportUid': '__',
    'bbsmisccookies': '__',
    'ngacn0comUserInfo': '__',
    'ngacn0comUserInfoCheck': '__',
    'ngacn0comInfoCheckTime': '__',
    'ngaPassportUrlencodedUname': '__',
    'ngaPassportCid': '__',
}
# ============= End of the part to edit =============

# Floors collected by single(); each entry is [floor, time, username, content].
totalfloor = []
# Thread id, sent as the 'tid' query parameter; set from user input in main().
tid = 0
# Page title; overwritten by single() with the text of the <title> tag.
title = 'title'
# Resume point for incremental runs; holder() overrides both from max.txt.
localmaxpage = 1
localmaxfloor = -1


def single(page):
    """Fetch one page of the thread and append its floors to ``totalfloor``.

    The page is requested from ``https://bbs.nga.cn/read.php`` with the
    module-level ``headers`` and ``cookies``, decoded as GBK and scraped with
    regular expressions. Each floor is appended to ``totalfloor`` as
    ``[floor, time, username, content]`` and the module-level ``title`` is
    updated from the page's ``<title>`` tag.

    Args:
        page: Page number, sent as the ``page`` query parameter.

    Returns:
        True when the page text contains the "next page" marker, else False.
    """
    global title
    print ('trypage%d' % page)
    params = (
        ('tid', tid),
        ('_ff', '-7'),
        ('page', page),
    )
    # Use the session as a context manager so its connections are released.
    with requests.Session() as session:
        response = session.get('https://bbs.nga.cn/read.php', headers=headers,
                               params=params, cookies=cookies)
        response.encoding = 'GBK'
        content = response.text

    # Keep the previous title instead of crashing when a page has no <title>.
    title_match = re.search(r'<title>(.+?)</title>', content, flags=re.S)
    if title_match is not None:
        title = title_match.group(1)

    # uid -> username, taken from the user table embedded in the page.
    usernames = dict(
        re.findall(r'"uid":(\d+?),"username":"(.+?)",', content, flags=re.S))

    replies = re.findall(r'func=ucp&uid=(\d+)\' id=\'postauthor.+?\'reply time\'>(.+?)</span></div>.+?<(?:p|span) id=\'postcontent(\d+?)\' class=\'postcontent ubbcode\'>(.+?)</(?:p|span)>', content, flags=re.S)
    for uid, posted_at, floor, body in replies:
        # Fall back to the raw uid when the user table has no entry for the
        # poster, so one unknown user does not abort the whole crawl.
        totalfloor.append([floor, posted_at, usernames.get(uid, uid), body])
    return re.search(r'下一页', content) is not None

# Matches a quote block such as
# [quote][tid=0000000]Topic[/tid] [b]Post by [uid=000000]who[/uid] (2020-03-26 01:07):[/b]text[/quote]
# Groups: (1) quoted user name, (2) quoted post time, (3) quoted text.
_QUOTE_PATTERN = re.compile(
    r'\[quote\].+?\[uid=\d+\](.+?)\[/uid\] \((.+?)\)\:\[/b\](.+?)\[/quote\]',
    flags=re.S)


def _quote_to_markdown(match):
    """Render one matched quote block as a Markdown blockquote."""
    quotetext = match.group(3).replace('\n', '\n>')
    return '>%s(%s) said:%s' % (match.group(1), match.group(2), quotetext)


def makefile():
    """Append every not-yet-saved floor in ``totalfloor`` to ``post.md``.

    Floors whose number is not greater than ``localmaxfloor`` are skipped.
    For each remaining floor a header line is written, then the content with
    ``<br>`` tags turned into newlines, ``[img]`` tags turned into Markdown
    images (downloading each image into the tid-named folder unless the file
    already exists), ``[s:ac:...]`` / ``[s:a2:...]`` emoticons turned into
    images from the ``smile`` folder, and quote blocks turned into Markdown
    blockquotes.

    Returns:
        The number of the last floor written by this call, or 0 when nothing
        was written.
    """
    lastfloor = 0
    with open(('.\\\\%d\\post.md' % tid), 'a', encoding='utf-8') as f:
        for onefloor in totalfloor:
            floor_no = int(onefloor[0])
            if localmaxfloor >= floor_no:
                continue  # already saved by an earlier run

            f.write("----\n##### %s. %s by %s\n" % (onefloor[0], onefloor[1], onefloor[2]))
            raw = str(onefloor[3])

            # Line breaks
            raw = raw.replace('<br/>', '\n')
            raw = raw.replace('<br>', '\n')

            # Images
            for ritem in re.findall(r'(?<=\[img\]).+?(?=\[/img\])', raw):
                url = str(ritem)
                if url[0:2] == './':
                    # Relative attachment path -> absolute attachment URL
                    url = 'https://img.nga.178.com/attachments/' + url[2:]
                # Drop the thumbnail suffix to fetch the full-size image
                url = url.replace('.medium.jpg', '')
                filename = hashlib.md5(bytes(url, encoding='utf-8')).hexdigest()[2:8] + url[-6:]
                target = '.\\\\%d\\%s' % (tid, filename)
                if not os.path.exists(target):
                    down(url, target)
                    print('down:%s' % target)
                raw = raw.replace(('[img]%s[/img]' % ritem), ('![img](.\\\\%s)' % filename))

            # Emoticons
            for ritem in re.findall(r'\[s\:(a2|ac)\:(.+?)\]', raw):
                raw = raw.replace('[s:%s:%s]' % (ritem[0], ritem[1]), '![%s](..\\\\smile\\%s.png)' % (ritem[0] + ritem[1], ritem[0] + ritem[1]))

            # Quotes: one substitution pass handles every quote block, so two
            # identical quotes in the same post no longer crash on a failed
            # re-search after the first replacement.
            raw = _QUOTE_PATTERN.sub(_quote_to_markdown, raw)

            f.write(("%s\n\n" % raw))
            lastfloor = floor_no
    return lastfloor

def down(url,path):
    """Stream the resource at ``url`` into the file at ``path``.

    Args:
        url: Address to request.
        path: Destination file, opened in binary write mode.
    """
    block_bytes = 1024  # chunk size passed to iter_content
    response = requests.get(url, stream=True)
    with closing(response), open(path, "wb") as out:
        for piece in response.iter_content(chunk_size=block_bytes):
            out.write(piece)

def main():
    """Ask for a thread id, run the crawl, then wait for Enter before exiting."""
    global tid
    entered = input('tid:')
    tid = int(entered)
    holder()
    # Final prompt; presumably keeps a double-click-launched console open.
    input('press to exit.')
def holder():
    """Crawl the thread from the saved resume point and write the results.

    Creates the tid-named folder if it is missing; otherwise, when it holds a
    ``max.txt``, loads the saved page and floor numbers into ``localmaxpage``
    and ``localmaxfloor``. Pages are then fetched with ``single()`` until it
    reports no next page. Afterwards ``max.txt`` is rewritten with the last
    page and floor, a line is appended to ``info.txt`` and ``makefile()`` is
    called to write the new floors.

    When no floor was collected at all, a message is printed and nothing is
    written, so an existing ``max.txt`` keeps its previous resume point.
    """
    global localmaxpage
    global localmaxfloor
    print(tid)
    if not os.path.exists(('.\\\\%d' % tid)):
        os.mkdir(('.\\\\%d' % tid))
    elif os.path.exists('.\\\\%d\\max.txt' % tid):
        with open('.\\\\%d\\max.txt' % tid, 'r', encoding='utf-8') as f:
            saved = f.read().split()
        # max.txt holds "<page> <floor>"
        localmaxpage = int(saved[0])
        localmaxfloor = int(saved[1])

    print('localmaxpage%d\nlocalmaxfloor%d' % (localmaxpage, localmaxfloor))
    cpage = localmaxpage
    while single(cpage):
        cpage = cpage + 1

    if not totalfloor:
        # Nothing was parsed, so there is no last floor to record. Stop here
        # instead of failing with IndexError below.
        print('no floor found, nothing saved.')
        return

    with open(('.\\\\%d\\max.txt' % tid), 'w', encoding='utf-8') as f:
        f.write("%d %s" % (cpage, totalfloor[-1][0]))

    info_path = '.\\\\%d\\info.txt' % tid
    is_new_info = not os.path.exists(info_path)
    # Append mode creates the file when it is missing.
    with open(info_path, 'a', encoding='utf-8') as f:
        if is_new_info:
            f.write('tid:%d\n' % tid)
        f.write('[%s]%d\n' % (time.asctime(time.localtime(time.time())), len(totalfloor) - 1))

    print('makeuntil:%d' % makefile())

# Start the interactive crawler only when this file is run as a script.
if __name__ == '__main__':
    main()

0 comments on commit 4f8f601

Please sign in to comment.