This repository has been archived by the owner on Mar 10, 2023. It is now read-only.

fix scrapy update bug
gricn committed Aug 23, 2021
1 parent a0e686f commit c0426c1
Showing 4 changed files with 19 additions and 61 deletions.
28 changes: 0 additions & 28 deletions mail.py

This file was deleted.

9 changes: 0 additions & 9 deletions start_crawl.py
@@ -5,9 +5,7 @@
 from time import strftime
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
-from dotenv import load_dotenv
 
-from mail import sendMail
 from webSpider.spiders.BATCM import BATCM
 from webSpider.spiders.NATCM import NATCM
 from webSpider.spiders.HCOHP import HCOHP
@@ -54,13 +52,6 @@ def job():
 
     process.start()
 
-    load_dotenv()
-
-    if ("SENDER" in os.environ) and ("RECEIVERS" in os.environ):
-        SENDER = os.environ["SENDER"]
-        RECEIVERS = os.environ.get["RECEIVERS"]
-        sendMail(sender=SENDER, receivers=RECEIVERS)
-
 
 if crawl_auto == "non":
     job()
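
With the mail step gone, start_crawl.py is reduced to scheduling and running the spiders. The following is a minimal sketch of how job() presumably looks after this commit, assembled from the imports and calls visible in the diff rather than copied from the repository:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from webSpider.spiders.BATCM import BATCM
    from webSpider.spiders.NATCM import NATCM
    from webSpider.spiders.HCOHP import HCOHP


    def job():
        process = CrawlerProcess(get_project_settings())

        # crawl() only schedules each spider; start() runs them all and
        # blocks until every crawl has finished.
        process.crawl(BATCM)
        process.crawl(NATCM)
        process.crawl(HCOHP)

        process.start()
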
31 changes: 19 additions & 12 deletions webSpider/pipelines.py
@@ -23,12 +23,11 @@ def connect_elasticsearch(self):
         load_dotenv()
         logging.debug("print config value: %s", os.environ)
 
-        USERNAME = os.environ.get("USERNAME", "changeme")
-        PASSWORD = os.environ.get("PASSWORD", "changeme")
-        URL = os.environ.get("URL", "http://localhost:9200")
+        USERNAME = os.environ.get("USERNAME", False)
+        PASSWORD = os.environ.get("PASSWORD", False)
+        URL = os.environ.get("URL", False)
 
-        if USERNAME == "changeme" and PASSWORD == "changeme":
-            # If anyone actually uses a username and password like these, better not to connect anyway, lol
+        if not (USERNAME and PASSWORD and URL):
             self.es_connected = False
         else:
             # See the official docs for details: https://elasticsearch-py.readthedocs.io/en/7.x/
@@ -40,7 +39,7 @@ def connect_elasticsearch(self):
 
             INDEX = os.environ.get("ES_INDEX", "changeme")
 
-            self.es.search(index="policy", filter_path=["hits.hits._id"])
+            self.es.search(index=INDEX, filter_path=["hits.total.value"])
         except Exception:
             logging.error("Fail to connect ElasticSearch.")
             self.es_connected = False
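
Taken together, the two hunks above change connect_elasticsearch from silently falling back to "changeme" defaults to refusing to connect unless USERNAME, PASSWORD and URL are all set, and to probing the configured ES_INDEX instead of a hard-coded "policy" index. A self-contained sketch of that guard follows; the helper name and the Elasticsearch constructor call are assumptions, since the diff does not show how the client is built:

    import os

    from dotenv import load_dotenv
    from elasticsearch import Elasticsearch  # elasticsearch-py 7.x, per the docs link in the diff


    def make_client():
        load_dotenv()  # reads USERNAME / PASSWORD / URL / ES_INDEX from a local .env file

        username = os.environ.get("USERNAME", False)
        password = os.environ.get("PASSWORD", False)
        url = os.environ.get("URL", False)

        if not (username and password and url):
            # Missing configuration: the pipeline marks itself offline
            # (self.es_connected = False) instead of guessing defaults.
            return None

        # http_auth is the standard way to pass basic-auth credentials in 7.x;
        # the repository's actual constructor arguments are not visible here.
        return Elasticsearch(url, http_auth=(username, password))
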
@@ -49,15 +48,15 @@ def open_spider(self, spider):
         self.connect_elasticsearch()
 
     def close_spider(self, spider):
-        pass
+        self.es.close()
 
     def process_item(self, item, spider):
 
         if self.es_connected:
 
             logging.debug("Processing items in pipelines: {}".format(item))
 
-            index = "policy"
+            index = os.environ["ES_INDEX"]
 
             logging.debug("publishingDate: " + item["publishingDate"])
 
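
One detail about the close_spider change: self.es is only created when the connection attempt succeeds, so a defensive variant (an assumption, not part of this commit) would close the client conditionally:

    def close_spider(self, spider):
        # self.es never exists when connect_elasticsearch bailed out early,
        # so only close an actually-established connection.
        if getattr(self, "es_connected", False):
            self.es.close()
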
@@ -80,11 +79,19 @@ def process_item(self, item, spider):
             insert_body = ItemAdapter(item).asdict()
             insert_body["@timestamp"] = strftime("%Y-%m-%dT%H:%M:%S%z")
 
-            if self.es.count(index=index, body=search_body)["count"] == 0:
+            result = self.es.search(
+                index=index,
+                body=search_body,
+                filter_path=["hits.hits._id", "hits.total.value"],
+            )
+
+            id = ""
+            count = result["hits"]["total"]["value"]
+
+            if count == 0:
                 self.es.create(index=index, body=insert_body, id=uuid.uuid1())
             else:
-                self.es.update_by_query(
-                    index=index, body=insert_body, conflicts="proceed"
-                )
+                id = result["hits"]["hits"][0]["_id"]
+                self.es.update(index=index, id=id, body={"doc": insert_body})
 
         return item
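
This last hunk appears to be the actual "scrapy update bug" fix: the old code decided between insert and update with a bare count() and then passed the item body to update_by_query, which expects a query (and optionally a script) rather than a replacement document, so existing records were likely never updated with new content. The new code searches for the matching document and, when one exists, updates it in place by _id. A hedged, self-contained sketch of that create-or-update pattern (the helper name and parameters are illustrative, not from the repository):

    import uuid

    from elasticsearch import Elasticsearch


    def upsert_item(es: Elasticsearch, index: str, search_body: dict, insert_body: dict) -> None:
        result = es.search(
            index=index,
            body=search_body,
            # filter_path trims the response to the two fields used below.
            filter_path=["hits.hits._id", "hits.total.value"],
        )

        if result["hits"]["total"]["value"] == 0:
            # No match: create a fresh document under a generated id.
            es.create(index=index, body=insert_body, id=uuid.uuid1())
        else:
            # Match found: partial-update that document via the "doc" wrapper.
            doc_id = result["hits"]["hits"][0]["_id"]
            es.update(index=index, id=doc_id, body={"doc": insert_body})
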
12 changes: 0 additions & 12 deletions webSpider/settings.py
@@ -57,20 +57,8 @@
 DOWNLOADER_MIDDLEWARES = {
     "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
     "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
-    "scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 400,
-    "scrapy_fake_useragent.middleware.RetryUserAgentMiddleware": 401,
 }
 
-# https://github.com/alecxe/scrapy-fake-useragent
-FAKEUSERAGENT_PROVIDERS = [
-    # this is the first provider we'll try
-    "scrapy_fake_useragent.providers.FakeUserAgentProvider",
-    # if FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us
-    "scrapy_fake_useragent.providers.FakerProvider",
-    # fall back to USER_AGENT value
-    "scrapy_fake_useragent.providers.FixedUserAgentProvider",
-]
-
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
 # EXTENSIONS = {
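
Dropping scrapy_fake_useragent means no middleware rotates the User-Agent header any more, and the stock UserAgentMiddleware and RetryMiddleware remain disabled by the None entries. If a fixed User-Agent or retries were still wanted later, a hedged sketch of the relevant settings.py lines (the values and the decision to re-enable anything are assumptions, not part of this commit) could look like:

    # settings.py -- hypothetical follow-up, not part of this commit
    USER_AGENT = "webSpider (+https://example.org)"  # placeholder; applied by the stock UserAgentMiddleware

    DOWNLOADER_MIDDLEWARES = {
        # Keep the fake-useragent middlewares out, but restore Scrapy's
        # built-in retry middleware at its default order.
        "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
    }
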
