-
Notifications
You must be signed in to change notification settings - Fork 148
/
metadata.py
148 lines (135 loc) · 4.97 KB
/
metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#encoding:utf-8
# NOTE: Python 2 only. reload(sys) makes sys.setdefaultencoding reachable
# again so that implicit str/unicode conversions in this module use UTF-8.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import traceback
import pygeoip
import threading
import socket
import sys
import hashlib
import datetime
import time
import json
import metautils
from bencode import bencode, bdecode
# GeoIP handle created once at import time from 'GeoIP.dat' (a path relative
# to the working directory); save_metadata uses it to log the peer's country.
geoip = pygeoip.GeoIP('GeoIP.dat')
def decode(encoding, s):
    """Best-effort conversion of a bencoded byte string to unicode.

    encoding -- the encoding declared by the torrent; tried first.
    s        -- a byte string, or a list of byte strings (joined with ';').

    The declared encoding, then 'utf8', 'gbk' and 'big5' are tried strictly
    in that order and the first successful decode is returned.  If none of
    them succeeds, the declared encoding is used with undecodable bytes
    dropped; if the declared encoding itself is unusable (unknown codec
    name or not a string) UTF-8 with undecodable bytes dropped is used
    instead, so a bogus 'encoding' field can no longer make this raise.
    """
    if isinstance(s, list):
        s = ';'.join(s)
    for candidate in (encoding, 'utf8', 'gbk', 'big5'):
        try:
            return s.decode(candidate)
        except (UnicodeError, LookupError, TypeError):
            # Wrong guess, unknown codec name, or non-string codec name:
            # move on to the next candidate.
            continue
    try:
        return s.decode(encoding, 'ignore')
    except (LookupError, TypeError):
        # The declared encoding is not a usable codec at all.
        return s.decode('utf8', 'ignore')
def decode_utf8(encoding, d, i):
    """Return field `i` of dict `d` as unicode, preferring its UTF-8 variant.

    encoding -- the torrent's declared encoding, used for the plain field.
    d        -- a decoded bencode dictionary.
    i        -- the field name, e.g. 'name' or 'comment'.

    When d contains the key i + '.utf-8', that value is decoded as UTF-8.
    If those bytes are not valid UTF-8 they are handed to decode() for a
    best-effort conversion instead of raising.  Without a '.utf-8' variant
    the plain field d[i] is passed to decode(); a missing d[i] raises
    KeyError.
    """
    utf8_key = i + '.utf-8'
    if utf8_key in d:
        raw = d[utf8_key]
        try:
            return raw.decode('utf8')
        except UnicodeError:
            # Mislabelled '.utf-8' field: fall back to encoding guessing.
            return decode(encoding, raw)
    return decode(encoding, d[i])
def parse_metadata(data):
    """Decode bencoded torrent metadata into a flat dict of unicode fields.

    data -- the raw bencoded bytes.

    Returns None when `data` cannot be bdecoded or the decoded top-level
    value has no non-empty 'name'.  Otherwise returns a dict with:
      'create_time' -- naive UTC datetime taken from 'creation date', or the
                       current UTC time when that field is missing/invalid
      'name', 'length', 'data_hash' (MD5 hex digest of the 'pieces' value)
      'files'       -- only for multi-file torrents: a list of dicts with
                       'path', 'length' and optionally 'filehash'
      'announce', 'comment' (first 200 chars), 'publisher-url', 'publisher',
      'creator' (first 15 chars), 'profiles' -- only when present.

    Raises KeyError when mandatory fields such as 'pieces' or 'length' are
    absent; callers are expected to handle that.
    """
    try:
        torrent = bdecode(data)
        # .get also rejects non-dict payloads via the except below.
        if not torrent.get('name'):
            return None
    except Exception:
        return None

    info = {}
    try:
        # Use UTC so the value agrees with the utcnow() fallback below.
        info['create_time'] = datetime.datetime.utcfromtimestamp(
            float(torrent['creation date']))
    except Exception:
        # Missing, non-numeric or out-of-range timestamp: best effort.
        info['create_time'] = datetime.datetime.utcnow()

    encoding = torrent.get('encoding') or 'utf8'

    # (source key in the torrent, key in the result, max length or None)
    optional_fields = (
        ('announce', 'announce', None),
        ('comment', 'comment', 200),
        ('publisher-url', 'publisher-url', None),
        ('publisher', 'publisher', None),
        ('created by', 'creator', 15),
    )
    for source_key, target_key, max_len in optional_fields:
        if torrent.get(source_key):
            # [:None] keeps the whole string.
            info[target_key] = decode_utf8(encoding, torrent, source_key)[:max_len]

    # The payload is either a full torrent (with an 'info' dict) or the
    # info dict itself.
    detail = torrent['info'] if 'info' in torrent else torrent
    info['name'] = decode_utf8(encoding, detail, 'name')

    if 'files' in detail:
        files = []
        for entry in detail['files']:
            if 'path.utf-8' in entry:
                # These components are UTF-8 by definition, so try UTF-8
                # before any legacy encoding to avoid mojibake.
                path = decode('utf8', '/'.join(entry['path.utf-8']))
            else:
                path = decode(encoding, '/'.join(entry['path']))
            item = {'path': path, 'length': entry['length']}
            if 'filehash' in entry:
                item['filehash'] = entry['filehash'].encode('hex')
            files.append(item)
        info['files'] = files
        info['length'] = sum(item['length'] for item in files)
    else:
        info['length'] = detail['length']

    info['data_hash'] = hashlib.md5(detail['pieces']).hexdigest()
    if 'profiles' in detail:
        info['profiles'] = detail['profiles']
    return info
def save_metadata(dbcurr, binhash, address, start_time, data):
utcnow = datetime.datetime.utcnow()
name = threading.currentThread().getName()
try:
info = parse_metadata(data)
if not info:
return
except:
traceback.print_exc()
return
info_hash = binhash.encode('hex')
info['info_hash'] = info_hash
# need to build tags
info['tagged'] = False
info['classified'] = False
info['requests'] = 1
info['last_seen'] = utcnow
info['source_ip'] = address[0]
if info.get('files'):
files = [z for z in info['files'] if not z['path'].startswith('_')]
if not files:
files = info['files']
else:
files = [{'path': info['name'], 'length': info['length']}]
files.sort(key=lambda z:z['length'], reverse=True)
bigfname = files[0]['path']
info['extension'] = metautils.get_extension(bigfname).lower()
info['category'] = metautils.get_category(info['extension'])
if info['category'] == u'安装包':
pass
elif info['category'] == u'压缩文件':
pass
elif info['category'] == u'图像':
pass
elif info['category'] == u'文档书籍':
pass
if 'files' in info:
try:
dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)', (info['info_hash'], json.dumps(info['files'])))
except:
print name, 'insert error', sys.exc_info()[1]
del info['files']
try:
try:
print '\n', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),'Saved', info['info_hash'], info['name'], (time.time()-start_time), 's', address[0], geoip.country_name_by_addr(address[0]),
except:
print '\n',datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Saved', info['info_hash'], sys.exc_info()[1]
ret = dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
(info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests'],
info.get('comment',''), info.get('creator','')))
dbcurr.connection.commit()
except:
print name, 'save error', info
traceback.print_exc()
return