forked from internetarchive/warctools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
warcunpack_ia.py
executable file
·221 lines (163 loc) · 7.51 KB
/
warcunpack_ia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/usr/bin/env python
"""warcunpack_ia - unpack HTTP response bodies from WARC files into a directory tree and write a tab-separated index of what was written"""
from __future__ import print_function

import mimetypes
import os
import os.path
import shlex
import sys
import traceback
import uuid
from contextlib import closing
from optparse import OptionParser

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

from hanzo.httptools import RequestMessage, ResponseMessage
from hanzo.warctools import ArchiveRecord, WarcRecord
# Make sure 'text/javascript' maps to the '.js' extension.  The extension has
# to carry its leading dot like every other entry in the mimetypes tables;
# an undotted 'js' never matches a '.js' file name and, when returned by
# guess_extension(), would be glued straight onto the file stem.
mimetypes.add_type('text/javascript', '.js')

# Command-line interface: zero or more WARC files (stdin when none are given).
parser = OptionParser(usage="%prog [options] [warc ...]")

parser.add_option("-D", "--default-name", dest="default_name",
                  help="file name to use when a URL path ends in '/' "
                       "(default: %default)")
parser.add_option("-o", "--output", dest="output",
                  help="directory to unpack into; created if missing "
                       "(default: current directory)")
parser.add_option("-l", "--log", dest="log_file",
                  help="index log to write (default: <warc>.index.txt in "
                       "the output directory, or stdout when reading stdin)")
parser.add_option("-W", "--wayback_prefix", dest="wayback",
                  help="prefix for the wayback URIs written to the index "
                       "(default: %default)")

parser.set_defaults(output=None, log_file=None, default_name='crawlerdefault', wayback="http://wayback.archive-it.org/")
def log_headers(log_file):
    """Write the tab-separated column header line of the index to *log_file*.

    The columns are in the same order as the fields written by log_entry().
    """
    header_row = '>>warc_file\twarc_id\twarc_type\twarc_content_length\twarc_uri_date\twarc_subject_uri\turi_content_type\toutfile\twayback_uri'
    print(header_row, file=log_file)
def log_entry(log_file, input_file, record, content_type, output_file, wayback_uri):
    """Append one tab-separated index row for an unpacked record.

    The row holds, in order: the input file name, the record's id, type,
    content_length, date and url attributes, the content type, the output
    file path and the wayback URI.  Every field is passed through str().
    """
    fields = [
        input_file,
        record.id,
        record.type,
        record.content_length,
        record.date,
        record.url,
        content_type,
        output_file,
        wayback_uri,
    ]
    row = "\t".join(map(str, fields))
    print(row, file=log_file)
def main(argv):
    """Unpack the WARC files named in *argv* and write an index for each.

    argv is a full argument vector (argv[0] is skipped).  With no file
    arguments the archive is read from standard input and the index goes to
    stdout unless --log is given.  With file arguments each archive gets its
    own ``<basename>.index.txt`` in the output directory, unless --log names
    a single index that is then shared by every input.

    Failures while reading a named archive are reported on stderr and the
    next file is tried.  Always returns 0.
    """
    (options, args) = parser.parse_args(args=argv[1:])

    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir = options.output
    else:
        output_dir = os.getcwd()

    # An explicit --log is opened exactly once and shared by all inputs;
    # reopening it per input would truncate the entries already written.
    # Text mode, because the index is written with print().
    shared_log = open(options.log_file, 'w') if options.log_file else None

    collisions = 0
    try:
        if shared_log is not None:
            log_headers(shared_log)

        if len(args) < 1:
            if shared_log is None:
                log_file = sys.stdout
                log_headers(log_file)
            else:
                log_file = shared_log
            # Archive data is binary: on Python 3 the byte stream behind
            # stdin is sys.stdin.buffer; Python 2's sys.stdin has no such
            # attribute and is used directly.
            stdin = getattr(sys.stdin, 'buffer', sys.stdin)
            with closing(WarcRecord.open_archive(file_handle=stdin, gzip=None)) as fh:
                collisions += unpack_records('<stdin>', fh, output_dir, options.default_name, log_file, options.wayback)
        else:
            for filename in args:
                if shared_log is not None:
                    log_file = shared_log
                else:
                    index_path = os.path.join(output_dir, os.path.basename(filename) + '.index.txt')
                    log_file = open(index_path, 'w')
                    log_headers(log_file)
                try:
                    with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                        collisions += unpack_records(filename, fh, output_dir, options.default_name, log_file, options.wayback)
                except Exception as e:
                    # Best effort: one unreadable archive must not stop the rest.
                    print("exception in handling", filename, e, file=sys.stderr)
                finally:
                    # Per-archive indexes are closed as soon as they are done;
                    # the shared one is closed once, below.
                    if log_file is not shared_log:
                        log_file.close()
    finally:
        if shared_log is not None:
            shared_log.close()

    if collisions:
        print(collisions, "filenames that collided", file=sys.stderr)

    return 0
def _wayback_timestamp(date):
    """Return *date* with the 'T', 'Z', ':' and '-' characters removed."""
    # Works for text on Python 2 and 3; the two-argument
    # str.translate(None, chars) form only exists on Python 2.
    if isinstance(date, bytes) and not isinstance(date, str):
        date = date.decode('ascii', 'replace')
    return ''.join(ch for ch in date if ch not in 'TZ:-')


def _collection_from_filename(filename):
    """Return the second '-'-separated field of *filename*, or '' if none."""
    fields = filename.split('-')
    return fields[1] if len(fields) > 1 else ''


def unpack_records(name, fh, output_dir, default_name, output_log, wayback_prefix):
    """Write the body of every successful HTTP response in an archive to disk.

    name           -- label for the archive, used in the index and messages
    fh             -- archive handle; its read_records(limit=None) yields
                      (offset, record, errors) tuples
    output_dir     -- root directory handed to output_file()
    default_name   -- file name handed to output_file() for URLs ending in '/'
    output_log     -- open index file handed to log_entry()
    wayback_prefix -- prefix of the wayback URI recorded in the index

    A collection id is taken from the ``collectionId=...`` entry of a
    warcinfo record's description, falling back to the second '-'-separated
    field of the WARC-Filename header or of *name*.  When one is known, each
    index row carries ``prefix + id + '/' + timestamp + '/' + url``.

    Errors in a single record are reported on stderr (with a traceback) and
    do not stop processing.  Returns the number of file name collisions
    reported by output_file().
    """
    # NOTE(review): the string comparisons below assume the record's type,
    # content type, date and url are text rather than bytes -- confirm
    # against the hanzo.warctools version in use.
    collectionId = ''
    collisions = 0
    for (offset, record, errors) in fh.read_records(limit=None):
        if record:
            try:
                content_type, content = record.content

                if record.type == WarcRecord.WARCINFO:
                    info = parse_warcinfo(record)
                    for entry in shlex.split(info.get('description', "")):
                        if entry.startswith('collectionId'):
                            # partition() tolerates an entry without '='.
                            collectionId = entry.partition('=')[2].split(',')[0]
                    if not collectionId:
                        filename = record.get_header("WARC-Filename")
                        collectionId = _collection_from_filename(filename or name)

                if record.type == WarcRecord.RESPONSE and content_type.startswith('application/http'):
                    code, mime_type, message = parse_http_response(record)

                    # Only 2xx responses are unpacked.
                    if 200 <= code < 300:
                        filename, collision = output_file(output_dir, record.url, mime_type, default_name)
                        if collision:
                            collisions += 1

                        wayback_uri = ''
                        if collectionId:
                            wayback_date = _wayback_timestamp(record.date)
                            wayback_uri = wayback_prefix + collectionId + '/' + wayback_date + '/' + record.url

                        with open(filename, 'wb') as out:
                            out.write(message.get_body())
                        # Index the file only once its body has been written.
                        log_entry(output_log, name, record, mime_type, filename, wayback_uri)

            except Exception as e:
                # Best effort: report the bad record and carry on with the next.
                traceback.print_exc()
                print("exception in handling record", e, file=sys.stderr)

        elif errors:
            print("warc errors at %s:%d" % (name, offset if offset else 0), end=' ', file=sys.stderr)
            for e in errors:
                print(e, end=' ', file=sys.stderr)
            print(file=sys.stderr)
    return collisions
def parse_warcinfo(record):
    """Parse the ``key: value`` lines of a warcinfo record into a dict.

    The body is read from ``record.content[1]``.  Each non-blank line is
    split at its first ':'; key and value are stored with surrounding
    whitespace removed.  Lines without a ':' are reported on stderr and
    skipped.  Any failure to read the body is reported on stderr, and
    whatever was parsed up to that point (possibly nothing) is returned.
    """
    info = {}
    try:
        body = record.content[1]
        # On Python 3 the body may arrive as bytes; decode so the str
        # operations below work.  On Python 2 bytes *is* str: left as is.
        if isinstance(body, bytes) and not isinstance(body, str):
            body = body.decode('utf-8', 'replace')

        for line in body.split('\n'):
            line = line.strip()
            if not line:
                continue
            key, colon, value = line.partition(':')
            if not colon:
                print('malformed warcinfo line', line, file=sys.stderr)
                continue
            info[key.strip()] = value.strip()
    except Exception as e:
        # Best effort: a damaged warcinfo record must not abort unpacking.
        print('exception reading warcinfo record', e, file=sys.stderr)
    return info
def parse_http_response(record):
    """Parse the HTTP response stored in ``record.content[1]``.

    Warnings about trailing data or a truncated response are printed to
    stderr.  Returns ``(status code, mime type, message)`` where the mime
    type is the first Content-Type header value up to any ';', or None when
    the response has no Content-Type header.
    """
    message = ResponseMessage(RequestMessage())
    leftover = message.feed(record.content[1])
    message.close()

    if leftover or not message.complete():
        if leftover:
            print('warning: trailing data in http response for', record.url, file=sys.stderr)
        if not message.complete():
            print('warning: truncated http response for', record.url, file=sys.stderr)

    header = message.header

    content_types = []
    for field, value in header.headers:
        if field.lower() == 'content-type':
            content_types.append(value)

    # Keep only the media type; parameters such as charset follow the ';'.
    mime_type = content_types[0].split(';')[0] if content_types else None

    return header.code, mime_type, message
def output_file(output_dir, url, mime_type, default_name):
    """Pick the on-disk path for the body of *url* and create its directory.

    The URL is flattened into a relative path: '://' becomes '/', and every
    character that is not a letter, a digit or one of ``_-/.`` becomes '_'.
    Empty, '.' and '..' path components are dropped so that a URL can never
    resolve outside *output_dir*.  A URL ending in '/' gets *default_name*
    as its file name.

    Extension rules:
      * with a mime type, the URL's own extension is kept when it already
        implies that type (e.g. '.jpeg' for image/jpeg); otherwise the
        default extension for the mime type is used, if one is known;
      * with no mime type and no extension, '.html' is used.

    The directory part is cut to 200 characters and the file name to 45.
    If the path already exists, '_R' plus 8 random hex characters is
    inserted before the extension until it is unique.

    Returns ``(path, collision)``: the real path to write to, and whether a
    rename was needed to avoid an existing file.
    """
    max_name_len = 45

    clean_url = "".join((c if c.isalpha() or c.isdigit() or c in '_-/.' else '_') for c in url.replace('://', '/', 1))

    parts = clean_url.split('/')
    directories, filename = parts[:-1], parts[-1]

    # The URL comes from the archive and is untrusted: '.' and '..' must not
    # be allowed to steer the path out of output_dir.
    path = [output_dir]
    path.extend(d for d in directories if d and d not in ('.', '..'))

    if filename and filename not in ('.', '..'):
        name, ext = os.path.splitext(filename)
    else:
        name, ext = default_name, ''

    if mime_type:
        # guess_type() returns a (type, encoding) pair; only the type is
        # comparable with mime_type.  The cleaned file name is used rather
        # than the raw URL so that the check is about the extension that
        # would actually be written.
        guessed_type, _ = mimetypes.guess_type(name + ext)
        # preserve variant file extensions, rather than clobber with default for mime type
        if not ext or guessed_type != mime_type.lower():
            mime_ext = mimetypes.guess_extension(mime_type)
            if mime_ext:
                ext = mime_ext
    elif not ext:
        ext = '.html'  # no mime type, no extension

    directory = os.path.normpath(os.path.join(*path))
    directory = directory[:200]

    if not os.path.exists(directory):
        os.makedirs(directory)

    # Cap the extension so that at least one character of the stem survives
    # and the slice bound below can never go to zero or negative.
    ext = ext[:max_name_len - 1]
    stem = name[:max_name_len - len(ext)]

    filename = stem + ext
    fullname = os.path.join(directory, filename)

    collision = False

    while os.path.exists(fullname):
        collision = True
        u = str(uuid.uuid4())[:8]
        filename = stem + '_R' + u + ext
        fullname = os.path.join(directory, filename)

    return os.path.realpath(os.path.normpath(fullname)), collision
# Script entry point: run main() on the process arguments and use its return
# value as the exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)