-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommoncrawldltool.py
94 lines (70 loc) · 2.59 KB
/
commoncrawldltool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python2
#Ryan Kantor
#Fall 2022
import sys
import os
import requests
import gzip
import warc
from clint.textui import progress
def queue_fromfile(indexfile):
    """Download every index named in a gzipped listing, then scan them.

    Each non-blank line of ``indexfile`` is treated as a path relative to
    https://data.commoncrawl.org/.  The file it names is saved in the local
    ``indexes`` directory under the last component of that path.  After all
    lines have been processed the directory is passed to ``queue_index``.

    Args:
        indexfile: Path to a gzip-compressed file holding one remote path
            per line.
    """
    # os.path.join yields ".\indexes" on Windows (the original literal) and
    # a real sub-directory on other platforms.
    index_dir = os.path.join(".", "indexes")
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    # The context manager closes the gzip handle even if a download fails.
    with gzip.open(indexfile) as listing:
        for raw in listing:
            remote = raw.strip()
            if not isinstance(remote, str):
                # gzip yields bytes on Python 3; URLs and paths need text.
                remote = remote.decode("utf-8")
            if not remote:
                # A blank line would request the bare base URL and try to
                # write it to a directory path.
                continue
            dl_file("https://data.commoncrawl.org/" + remote,
                    os.path.join(index_dir, remote.split('/')[-1].strip()))
    queue_index(index_dir)
def queue_index(indexdir):
    """Run ``find_pdfs`` on every entry of the directory ``indexdir``.

    A progress line naming the entry is printed before each one is scanned.
    """
    entries = os.listdir(indexdir)
    for entry in entries:
        announcement = "".join(
            ["Searching for PDFs in ", indexdir, "\\", entry, "..."])
        print(announcement)
        entry_path = os.path.join(indexdir, entry)
        find_pdfs(entry_path)
def queue_paths(pathsgz):
    """Download each WARC file named in a gzipped listing and scan it.

    Each non-blank line of ``pathsgz`` is treated as a path relative to
    https://data.commoncrawl.org/.  Every file is saved in the local
    ``warcs`` directory under the last component of its path, and the
    downloaded file is then passed to ``find_pdfs``.

    Args:
        pathsgz: Path to a gzip-compressed file holding one remote path
            per line.
    """
    # os.path.join yields ".\warcs" on Windows (the original literal) and a
    # real sub-directory on other platforms.
    warc_dir = os.path.join(".", "warcs")
    if not os.path.exists(warc_dir):
        os.mkdir(warc_dir)
    # Read the listing once: this gives the total for the progress message
    # without decompressing the file a second time, and the context manager
    # guarantees the handle is closed.
    remotes = []
    with gzip.open(pathsgz) as listing:
        for raw in listing:
            remote = raw.strip()
            if not isinstance(remote, str):
                # gzip yields bytes on Python 3; URLs and paths need text.
                remote = remote.decode("utf-8")
            if remote:
                # Blank lines are neither counted nor downloaded.
                remotes.append(remote)
    total = len(remotes)
    for position, remote in enumerate(remotes, 1):
        print("Downloading WARC file " + str(position) + " of " + str(total))
        warcgz = dl_file("https://data.commoncrawl.org/" + remote,
                         os.path.join(warc_dir, remote.split('/')[-1].strip()))
        # NOTE(review): find_pdfs scans its input line by line for
        # 'crawl-data' tokens; confirm that is the intended scan for a
        # downloaded WARC file rather than find_pdfs_warc.
        find_pdfs(warcgz)
def find_pdfs(indices):
    """Download the file referenced by every line of an index mentioning 'pdf'.

    The gzipped file ``indices`` is read line by line.  On each line that
    contains ``pdf``, every whitespace-separated token containing
    ``crawl-data`` is taken as a remote path (after dropping its first
    character and its last two) relative to https://data.commoncrawl.org/,
    and is downloaded into the local ``warcs`` directory.

    Args:
        indices: Path to a gzip-compressed text file to scan.
    """
    # os.path.join yields ".\warcs" on Windows (the original literal) and a
    # real sub-directory on other platforms.
    warc_dir = os.path.join(".", "warcs")
    if not os.path.exists(warc_dir):
        os.mkdir(warc_dir)
    # Many lines can name the same remote file; fetch each one only once
    # per scan instead of re-downloading it over the same local path.
    fetched = set()
    # The context manager closes the gzip handle even if a download fails.
    with gzip.open(indices) as index_file:
        for line in index_file:
            # Byte literals match the raw gzip lines on both Python 2
            # (where bytes is str) and Python 3.
            if b'pdf' not in line:
                continue
            for token in line.split():
                if b'crawl-data' not in token:
                    continue
                # Drop the first character and the last two of the token
                # (presumably the surrounding quote and trailing quote plus
                # separator -- TODO confirm against the index format).
                remote = token[1:-2]
                if not isinstance(remote, str):
                    # URLs and local paths need text on Python 3.
                    remote = remote.decode("utf-8", "replace")
                if remote in fetched:
                    continue
                fetched.add(remote)
                name = remote.split('/')[-1]
                print("Downloading " + name)
                dl_file("https://data.commoncrawl.org/" + remote,
                        os.path.join(warc_dir, name))
def find_pdfs_warc(warcgzfile):
    """Print the Content-Type of every record in a WARC file.

    An extra "found pdf" line is printed after each record whose
    Content-Type contains ``pdf``.
    """
    records = warc.open(warcgzfile)
    for rec in records:
        content_type = rec['Content-Type']
        print(content_type)
        if 'pdf' not in content_type:
            continue
        print("found pdf")
def dl_file(dest, filename):
    """Stream the resource at URL ``dest`` into the local file ``filename``.

    Args:
        dest: URL to download.
        filename: Local path to write.  Its parent directory is created
            when it does not exist yet.

    Returns:
        ``filename``, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status;
            nothing is written in that case.
    """
    parent = os.path.dirname(filename)
    # A bare file name has no parent directory, and os.mkdir('') would fail.
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    r = requests.get(dest, stream=True)
    try:
        # Check the status before opening the target so that an error page
        # is never saved in place of the requested archive.
        r.raise_for_status()
        chunks = r.iter_content(chunk_size=1024)
        length = r.headers.get('content-length')
        if length is not None:
            # Floor division keeps the expected size an integer on Python 3.
            chunks = progress.bar(chunks,
                                  expected_size=(int(length) // 1024) + 1)
        # With no Content-Length there is no total to size the bar with, so
        # the body is streamed without a progress bar.
        with open(filename, 'wb') as f:
            for chunk in chunks:
                if chunk:
                    f.write(chunk)
    finally:
        # Release the connection even when the download fails part-way.
        r.close()
    return filename
def main():
    """Process the gzipped index listing named on the command line.

    The first command-line argument is passed to ``queue_fromfile``.  When
    it is missing, a usage message is written to stderr and the process
    exits with a non-zero status instead of failing with an IndexError.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: commoncrawldltool.py <index-paths.gz>")
    queue_fromfile(sys.argv[1])


# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()