diff --git a/mail.py b/mail.py
deleted file mode 100644
index ea0384d..0000000
--- a/mail.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/python
-# -*- coding: UTF-8 -*-
-
-import smtplib
-import os
-import logging
-from email.mime.text import MIMEText
-from email.header import Header
-from time import strftime
-
-current_time = strftime("%Y-%m-%dT%H:%M:%S%z")
-subject = f"{current_time} scrapy result"
-
-
-def sendMail(sender, receivers, subject=subject):
-
-    # Three arguments: the first is the text content, the second ("plain") sets the text format, the third ("utf-8") sets the encoding
-    message = MIMEText("爬虫完成", "plain", "utf-8")
-    message["From"] = Header(sender, "utf-8")  # sender
-    message["To"] = Header(receivers.as_string(), "utf-8")  # receivers
-    message["Subject"] = Header(subject, "utf-8")
-
-    try:
-        smtpObj = smtplib.SMTP("localhost")
-        smtpObj.sendmail(sender, receivers, message.as_string())
-        print("邮件发送成功")
-    except smtplib.SMTPException:
-        print("Error: 无法发送邮件")
diff --git a/start_crawl.py b/start_crawl.py
index d1b977a..a846252 100644
--- a/start_crawl.py
+++ b/start_crawl.py
@@ -5,9 +5,7 @@ from time import strftime
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 
-from dotenv import load_dotenv
-from mail import sendMail
 from webSpider.spiders.BATCM import BATCM
 from webSpider.spiders.NATCM import NATCM
 from webSpider.spiders.HCOHP import HCOHP
 
@@ -54,13 +52,6 @@ def job():
 
     process.start()
 
-    load_dotenv()
-
-    if ("SENDER" in os.environ) and ("RECEIVERS" in os.environ):
-        SENDER = os.environ["SENDER"]
-        RECEIVERS = os.environ.get["RECEIVERS"]
-        sendMail(sender=SENDER, receivers=RECEIVERS)
-
 
 if crawl_auto == "non":
     job()
diff --git a/webSpider/pipelines.py b/webSpider/pipelines.py
index b24f3de..6665ba5 100644
--- a/webSpider/pipelines.py
+++ b/webSpider/pipelines.py
@@ -23,12 +23,11 @@ def connect_elasticsearch(self):
         load_dotenv()
         logging.debug("print config value: %s", os.environ)
 
-        USERNAME = os.environ.get("USERNAME", "changeme")
-        PASSWORD = os.environ.get("PASSWORD", "changeme")
-        URL = os.environ.get("URL", "http://localhost:9200")
+        USERNAME = os.environ.get("USERNAME", False)
+        PASSWORD = os.environ.get("PASSWORD", False)
+        URL = os.environ.get("URL", False)
 
-        if USERNAME == "changeme" and PASSWORD == "changeme":
-            # If anyone really uses a username and password like this, don't bother connecting, lol
+        if not (USERNAME and PASSWORD and URL):
             self.es_connected = False
         else:
             # For details, see the official docs: https://elasticsearch-py.readthedocs.io/en/7.x/
@@ -40,7 +39,7 @@ def connect_elasticsearch(self):
 
                 INDEX = os.environ.get("ES_INDEX", "changeme")
 
-                self.es.search(index="policy", filter_path=["hits.hits._id"])
+                self.es.search(index=INDEX, filter_path=["hits.total.value"])
             except Exception:
                 logging.error("Fail to connect ElasticSearch.")
                 self.es_connected = False
@@ -49,7 +48,7 @@ def open_spider(self, spider):
 
         self.connect_elasticsearch()
 
     def close_spider(self, spider):
-        pass
+        self.es.close()
 
     def process_item(self, item, spider):
@@ -57,7 +56,7 @@ def process_item(self, item, spider):
 
         logging.debug("Processing items in pipelines: {}".format(item))
 
-        index = "policy"
+        index = os.environ["ES_INDEX"]
 
         logging.debug("publishingDate: " + item["publishingDate"])
 
@@ -80,11 +79,19 @@ def process_item(self, item, spider):
         insert_body = ItemAdapter(item).asdict()
         insert_body["@timestamp"] = strftime("%Y-%m-%dT%H:%M:%S%z")
 
-        if self.es.count(index=index, body=search_body)["count"] == 0:
+        result = self.es.search(
+            index=index,
+            body=search_body,
+            filter_path=["hits.hits._id", "hits.total.value"],
+        )
+
+        id = ""
+        count = result["hits"]["total"]["value"]
+
+        if count == 0:
             self.es.create(index=index, body=insert_body, id=uuid.uuid1())
         else:
-            self.es.update_by_query(
-                index=index, body=insert_body, conflicts="proceed"
-            )
+            id = result["hits"]["hits"][0]["_id"]
+            self.es.update(index=index, id=id, body={"doc": insert_body})
 
         return item
diff --git a/webSpider/settings.py b/webSpider/settings.py
index 554a4ce..4ddc70e 100644
--- a/webSpider/settings.py
+++ b/webSpider/settings.py
@@ -57,20 +57,8 @@ DOWNLOADER_MIDDLEWARES = {
     "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
     "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
-    "scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 400,
-    "scrapy_fake_useragent.middleware.RetryUserAgentMiddleware": 401,
 }
 
-# https://github.com/alecxe/scrapy-fake-useragent
-FAKEUSERAGENT_PROVIDERS = [
-    # this is the first provider we'll try
-    "scrapy_fake_useragent.providers.FakeUserAgentProvider",
-    # if FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us
-    "scrapy_fake_useragent.providers.FakerProvider",
-    # fall back to USER_AGENT value
-    "scrapy_fake_useragent.providers.FixedUserAgentProvider",
-]
-
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
 # EXTENSIONS = {
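Note on the webSpider/pipelines.py changes above: the pipeline now refuses to connect unless USERNAME, PASSWORD, and URL are all present in the environment, reads the target index from ES_INDEX, and replaces the old count() / update_by_query() de-duplication with a search followed by a targeted create or update. Below is a minimal, self-contained sketch of that flow against elasticsearch-py 7.x, not code from this repository; the example item, the link-based term query, and the fallback index name are illustrative assumptions.

# Sketch of the env-gated connect plus search-then-create/update pattern.
import os
import uuid
from time import strftime

from elasticsearch import Elasticsearch  # elasticsearch-py 7.x

USERNAME = os.environ.get("USERNAME", False)
PASSWORD = os.environ.get("PASSWORD", False)
URL = os.environ.get("URL", False)
INDEX = os.environ.get("ES_INDEX", "policy-demo")  # hypothetical fallback index

if not (USERNAME and PASSWORD and URL):
    # Mirrors the pipeline's behaviour of skipping indexing without full config.
    raise SystemExit("Elasticsearch credentials missing; skipping indexing")

es = Elasticsearch([URL], http_auth=(USERNAME, PASSWORD))

# Hypothetical scraped item and match query; the real pipeline builds
# search_body from the item's own fields.
item = {"title": "Example policy", "link": "https://example.org/p/1"}
search_body = {"query": {"term": {"link.keyword": item["link"]}}}

insert_body = dict(item)
insert_body["@timestamp"] = strftime("%Y-%m-%dT%H:%M:%S%z")

# Ask only for the matching ids and the total hit count.
result = es.search(
    index=INDEX,
    body=search_body,
    filter_path=["hits.hits._id", "hits.total.value"],
)

if result["hits"]["total"]["value"] == 0:
    # Nothing indexed for this item yet: create it under a fresh UUID.
    es.create(index=INDEX, id=str(uuid.uuid1()), body=insert_body)
else:
    # Already indexed: update the matched document in place
    # (the old code used update_by_query for this).
    doc_id = result["hits"]["hits"][0]["_id"]
    es.update(index=INDEX, id=doc_id, body={"doc": insert_body})

es.close()

Requesting only hits.hits._id and hits.total.value via filter_path keeps the response small while still returning the document id needed for the in-place update, and close() releases the client's connections when the spider shuts down.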