crawl_trial.py (forked from jphcoi/crawtext)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#
import os
import sys
import seachengine2
import warnings
import zipfile
import yaml
from library import *
from library_exp import *
from pattern import web
# tweak textmate/python bug on utf8
reload(sys)
sys.setdefaultencoding("utf-8")
# config Seeks search
seeks_search = 1 # 0 is off / 1 is on
nb_results = 1 # number of results to request; Seeks usually returns fewer than asked
def unzip_file_into_dir(file, dir):
    """Extract the zip archive `file` into `dir`, creating directories as needed."""
    try:
        os.mkdir(dir, 0777)
    except:
        pass
    zfobj = zipfile.ZipFile(file)
    for name in zfobj.namelist():
        if name.endswith('/'):
            try:
                os.mkdir(os.path.join(dir, name))
            except:
                pass
        else:
            outfile = open(os.path.join(dir, name), 'wb')
            outfile.write(zfobj.read(name))
            outfile.close()
print "Option set: Never print matching warnings"
warnings.filterwarnings("ignore")
global pages
pages={}
global pattern_date_fr
# try:
# os.mkdir('data')
# except:
# pass
try:
    print "Trying to read parameters from the command line..."
    user_parameters=sys.argv[1]
except:
    print "No command-line arguments. Reading parameters from crawl_parameters.yml..."
    user_parameters='crawl_parameters.yml'
parameters = yaml.load('\n'.join(open(user_parameters,'r').readlines()))
print 'Loaded parameters: ', parameters
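# Illustrative sketch of what crawl_parameters.yml is expected to contain,
# given the keys read below; the values here are hypothetical examples, not
# taken from the repository:
#
#   path: corpus/my_corpus.zip        # or: corpus_file: corpus/my_corpus
#   query: "my query"
#   inlinks_min: 1
#   depth: 10
#   result_path: output
#   max_pages_number: 10000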
try:
    path=parameters['path']
except:
    print 'invalid parameters file: no "path" key, falling back to "corpus_file"'
    path=parameters['corpus_file']
print 'Path: ', path
inlinks_min=int(parameters.get('inlinks_min',1))
print "Minimum number of inlinks (default is 1): ", inlinks_min
depth=int(parameters.get('depth',10))
print "Crawl depth (default is 10): ", depth
if parameters.has_key('query'):
    query=parameters.get('query')
    print "Query: ", query
else:
    sys.exit("You need to enter a query in the parameters file.")
result_path=parameters.get('result_path','output')
print "Crawl output path (default is ./output/): ", result_path
max_pages_number=int(parameters.get('max_pages_number',10000))
# 999999 is treated as an explicit "no cap" value; any other value is capped at 100000
if max_pages_number == 999999:
    pass
else:
    max_pages_number=min(max_pages_number,100000)
print "Maximum number of pages (default is 10000): ", max_pages_number
if path[-4:]=='.zip':
    print 'unzipping ' + path + '...'
    corpus_out = '/'.join(path.split('/')[:-1]) + '/'+query
    print corpus_out
    unzip_file_into_dir(path,corpus_out)
    path=corpus_out
    print 'Path: ',path
if seeks_search == 1:
    print "Seeks search enabled. Creating Seeks file in %s" % path
    make_seeds(query,path,nb_results=nb_results)
dirList=os.listdir(path)
print 'List of files in path: ',dirList
# Seed the crawl frontier: extract the URLs found in every file of the corpus directory.
for fname in dirList[:]:
    pagelist = os.path.join(path,fname)
    try:
        url=web.URL(pagelist)
        chaine=url.download(cached=False)
        new_urls = map(lambda x: url_uniformer(x.split('">')[0]),web.find_urls(chaine, unique=True))
        if 'Google Search' in pagelist:
            new_urls = map(lambda x:x.split("&")[0],new_urls)
        for new_url in new_urls[:]:
            print "Checking for forbidden URL..."
            if not check_forbidden((new_url,'')) and not new_url in pages:
                pages[new_url]=inlinks_min
    except:
        pass
print 'Pages init: ', len(pages)
print 'Pages: ', pages
print "Naming database..."
db_name=os.path.join(result_path,query+'_crawl.db')
try:
    os.mkdir(result_path)
except:
    pass
try:
    # remove any previous crawl database for this query
    os.remove(db_name)
    print 'Deleted: ', db_name
except:
    pass
crawler=seachengine2.crawler(db_name)
try:
    crawler.createindextables()
except:
    print "Tables already exist, good."
crawler.crawl(pages,query=query,inlinks=inlinks_min,depth=depth,max_pages_number=max_pages_number,citations_whole=0)
exportcrawl2resolu(db_name,query,result_path)
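# Usage sketch (assumes Python 2 and that library, library_exp, seachengine2
# and the pattern package are importable; not part of the original file):
#   python crawl_trial.py my_parameters.yml   # explicit parameters file
#   python crawl_trial.py                     # falls back to crawl_parameters.yml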