-
Notifications
You must be signed in to change notification settings - Fork 3
/
filter_comments_v2.py
114 lines (100 loc) · 4.71 KB
/
filter_comments_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from timestamp import get_now_timestamp
from sqlalchemy.sql import and_, asc, desc, or_
from sqlalchemy import Column, String, create_engine, Integer, SmallInteger
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects import mysql
from sqlalchemy import text
from model import Comment, Site
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
import re
import os
import requests
# SQLAlchemy database URL. Credentials should not have to live in source
# control, so the FILTER_COMMENTS_DB_URL environment variable takes precedence;
# the fallback keeps the historical value so existing setups keep working.
sqlconn = os.environ.get(
    'FILTER_COMMENTS_DB_URL',
    'mysql+pymysql://root:1101syw@localhost:3306/test?charset=utf8mb4',
)

# Request headers sent with every `requests.get` probe (desktop Chrome UA).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

# host:port of the HTTP proxy used for the probes; override with the
# FILTER_COMMENTS_PROXY environment variable.
proxy = os.environ.get('FILTER_COMMENTS_PROXY', '127.0.0.1:1080')

# Both schemes are tunnelled through the same plain-HTTP proxy endpoint.
proxies = {
    'http': 'http://' + proxy,
    'https': 'http://' + proxy,
}
def judge_comment(comment, broswer, session):
    """Visit every link found in a comment and record each reachable site.

    For each domain-like string extracted from ``comment.content`` the
    function probes ``http://<link>`` with ``requests``; when the response
    status is 200 it loads the page in the Selenium driver, takes a
    screenshot, and stores a ``Site`` row unless a row with the same
    screenshot path already exists.

    Args:
        comment: Object exposing ``id``, ``content`` and ``user_link``; the
            last 24 characters of ``user_link`` are stored as the user id.
        broswer: Selenium WebDriver used to load pages. The misspelled name
            is kept so existing keyword callers do not break.
        session: SQLAlchemy session used to query and insert ``Site`` rows.

    Returns:
        None. Errors for an individual link are printed and the loop moves
        on to the next link.
    """
    # Use the driver that was passed in instead of relying on a module-level
    # global named ``browser`` that only exists when run as a script.
    browser = broswer

    links = re.findall(
        r'((?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6})+(?:(?:\/[=\w\?]+)*))+',
        comment.content)
    user_id = comment.user_link[-24:]
    print(comment.id, user_id, links)

    # Index of the next screenshot for this comment; only advanced for links
    # that were actually rendered in the browser.
    shot_index = 0
    for link in links:
        url = 'http://' + link
        try:
            res = requests.get(url, headers=headers, timeout=8, proxies=proxies)
            print("Visiting Site: http://%s" % link)
            print("Status Code: %s" % res.status_code)
            if res.status_code != 200:
                print("响应失败")
                print("-----------------------------")
                continue

            browser.get(url)
            print("响应成功")
            title = browser.title or ''

            shot_name = '%s_%s.png' % (comment.id, shot_index)
            try:
                # Build the on-disk path with the platform separator and make
                # sure the target directory exists before saving.
                shot_dir = os.path.join('.', 'screenshots')
                os.makedirs(shot_dir, exist_ok=True)
                shot_file = os.path.join(shot_dir, shot_name)
                if os.path.exists(shot_file):
                    print("截图已存在")
                elif browser.save_screenshot(shot_file):
                    print("截图成功")
                else:
                    # save_screenshot reports a write failure by returning False.
                    print("截图失败:%s" % shot_file)
            except Exception as err_msg:
                # A failed screenshot is best-effort; keep processing the link.
                print("截图失败:%s" % err_msg)
            print("-----------------------------")

            # Alert handling.
            # TODO: to save time, alerts are not checked for every URL; the
            # comment known to trigger one is special-cased here.
            if comment.id == 20568:
                # Wait up to 20 s, polling every 0.5 s, for the alert to appear.
                WebDriverWait(browser, 20, 0.5).until(EC.alert_is_present())
                alert = browser.switch_to.alert
                update_status = alert.text
                print(update_status)
                alert.accept()  # click the alert's OK button

            site = Site()
            site.user_id = user_id
            site.comment_id = comment.id
            site.land_page = browser.current_url
            site.url = link
            site.page_title = title
            # The stored path always uses forward slashes, independent of OS.
            site.screenshot = './screenshots/' + shot_name
            site.type = 2
            site.detail = ''
            site.create_time = get_now_timestamp()
            shot_index += 1

            # Insert only when no row with this screenshot path exists yet.
            try:
                rows = session.query(Site).filter(Site.screenshot.like(site.screenshot)).all()
                if not rows:
                    session.add(site)
                    session.commit()
            except Exception:
                # A failed flush/commit leaves the session unusable until it
                # is rolled back; reset it so later links can still be stored.
                session.rollback()
                raise
            # Updating the comment's own type is intentionally not done yet.
        except Exception as e:
            print("Err: ", e)
# Script entry point: load every type-2 comment whose content contains a
# domain-like string and run judge_comment on each with one shared headless
# Chrome driver. `browser` and `session` are deliberately module-level names.
if __name__ == '__main__':
    engine = create_engine(sqlconn, echo=True, max_overflow=8)
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    browser = None
    try:
        # MySQL REGEXP pre-filter so only comments with a candidate link are loaded.
        comments = session.query(Comment).filter(
            Comment.type == 2,
            Comment.content.op('regexp')(r'([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}'),
        ).all()

        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        # Headless Chrome cannot maximize its window, so fix the size instead.
        option.add_argument("--window-size=1920,1050")
        # `options=` replaces the deprecated `chrome_options=` keyword.
        browser = webdriver.Chrome(options=option)

        print("len: ", len(comments))
        for comment in comments:
            judge_comment(comment, browser, session)
    finally:
        # Always release the driver process and the DB connection, even when
        # the query or driver start-up fails.
        if browser is not None:
            browser.quit()
        session.close()