Skip to content

Commit

Permalink
Fix bug for Google safe mode; Fix bug when creating directories accordi…
Browse files Browse the repository at this point in the history
…ng to keywords;
  • Loading branch information
sczhengyabin committed Dec 2, 2016
1 parent c214834 commit c5b2cd6
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 7 deletions.
4 changes: 2 additions & 2 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@


def google_gen_query_url(keywords, face_only=False, safe_mode=False):
base_url = "https://www.google.com/search?tbm=isch"
base_url = "https://www.google.com/search?tbm=isch&hl=en"
keywords_str = "&q=" + quote(keywords)
query_url = base_url + keywords_str
if face_only is True:
Expand All @@ -42,7 +42,7 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False):


def google_image_url_from_webpage(driver):
time.sleep(10)
# time.sleep(10)
image_elements = driver.find_elements_by_class_name("rg_l")
image_urls = list()
url_pattern = "imgurl=\S*&imgrefurl"
Expand Down
9 changes: 7 additions & 2 deletions downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,12 @@


headers = {
'Connection': 'close',
'User-Agent': 'Chrome/54.0.2840.100'
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Proxy-Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
"Accept-Encoding": "gzip, deflate, sdch",
# 'Connection': 'close',
}


Expand Down Expand Up @@ -59,6 +63,7 @@ def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, time
:param concurrency: number of requests process simultaneously
:return: none
"""

with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
futures = list()
count = 0
Expand Down
14 changes: 13 additions & 1 deletion mainwindow.ui
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,9 @@ per keywords</string>
<property name="text">
<string>HTTP</string>
</property>
<property name="checked">
<bool>true</bool>
</property>
<attribute name="buttonGroup">
<string notr="true">buttonGroup_2</string>
</attribute>
Expand All @@ -762,7 +765,7 @@ per keywords</string>
<string>Socks5</string>
</property>
<property name="checked">
<bool>true</bool>
<bool>false</bool>
</property>
<attribute name="buttonGroup">
<string notr="true">buttonGroup_2</string>
Expand All @@ -776,6 +779,15 @@ per keywords</string>
<pointsize>12</pointsize>
</font>
</property>
<property name="toolTip">
<string>input ip:port</string>
</property>
<property name="statusTip">
<string>xxx.xxx.xxx.xx:port</string>
</property>
<property name="placeholderText">
<string>xxx.xxx.xxx.xx:port</string>
</property>
</widget>
</item>
</layout>
Expand Down
6 changes: 5 additions & 1 deletion ui_mainwindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ def setupUi(self, MainWindow):
font.setPointSize(12)
self.radioButton_http.setFont(font)
self.radioButton_http.setFocusPolicy(QtCore.Qt.TabFocus)
self.radioButton_http.setChecked(True)
self.radioButton_http.setObjectName(_fromUtf8("radioButton_http"))
self.buttonGroup_2 = QtGui.QButtonGroup(MainWindow)
self.buttonGroup_2.setObjectName(_fromUtf8("buttonGroup_2"))
Expand All @@ -431,7 +432,7 @@ def setupUi(self, MainWindow):
font.setPointSize(12)
self.radioButton_socks5.setFont(font)
self.radioButton_socks5.setFocusPolicy(QtCore.Qt.TabFocus)
self.radioButton_socks5.setChecked(True)
self.radioButton_socks5.setChecked(False)
self.radioButton_socks5.setObjectName(_fromUtf8("radioButton_socks5"))
self.buttonGroup_2.addButton(self.radioButton_socks5)
self.horizontalLayout_3.addWidget(self.radioButton_socks5)
Expand Down Expand Up @@ -557,6 +558,9 @@ def retranslateUi(self, MainWindow):
self.checkBox_proxy.setText(_translate("MainWindow", "&Proxy:", None))
self.radioButton_http.setText(_translate("MainWindow", "HTTP", None))
self.radioButton_socks5.setText(_translate("MainWindow", "Socks5", None))
self.lineEdit_proxy.setToolTip(_translate("MainWindow", "input ip:port", None))
self.lineEdit_proxy.setStatusTip(_translate("MainWindow", "xxx.xxx.xxx.xx:port", None))
self.lineEdit_proxy.setPlaceholderText(_translate("MainWindow", "xxx.xxx.xxx.xx:port", None))
self.menuAbout.setTitle(_translate("MainWindow", "Help", None))
self.actionAbout.setText(_translate("MainWindow", "About", None))

8 changes: 7 additions & 1 deletion utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# -*- coding: utf-8 -*-


def gen_valid_dir_name_for_keywords(keywords):
    """Turn a search-keyword string into a safe single directory name.

    Spaces become "_" and colons become "-"; every other character is kept
    only if it is alphanumeric or one of "-", "_", ".".  Trailing dots are
    then removed, and "_" is returned when nothing usable is left.

    :param keywords: raw keyword string entered by the user
    :return: a non-empty string containing only alphanumerics, "-", "_"
             and ".", never ending in "."
    """
    keep = ("-", "_", ".")
    keywords = keywords.replace(" ", "_").replace(":", "-")
    name = "".join(c for c in keywords if c.isalnum() or c in keep)
    # Whitespace has already been replaced or filtered out, so a plain
    # rstrip() would do nothing here.  Strip trailing dots instead: Windows
    # does not allow a directory name to end with a period, and this also
    # turns "." / ".." into an empty name rather than a path that points at
    # the current or parent directory.
    name = name.rstrip(".")
    # An empty name would make the caller's "<output_dir>/<name>" collapse
    # into the output directory itself, so fall back to a placeholder.
    return name or "_"


class AppConfig(object):
def __init__(self):
self.engine = "Google"
Expand Down Expand Up @@ -29,7 +35,7 @@ def to_command_paras(self):

str_paras += ' -j ' + str(self.num_threads)

str_paras += ' -o "' + self.output_dir + '/' + self.keywords + '"'
str_paras += ' -o "' + self.output_dir + '/' + gen_valid_dir_name_for_keywords(self.keywords) + '"'

if self.face_only:
str_paras += ' -F '
Expand Down

0 comments on commit c5b2cd6

Please sign in to comment.