This repository has been archived by the owner on Mar 10, 2023. It is now read-only.

fix scrapy update bug
gricn committed Aug 23, 2021
1 parent a0e686f commit c0426c1
Showing 4 changed files with 19 additions and 61 deletions.
28 changes: 0 additions & 28 deletions mail.py

This file was deleted.

9 changes: 0 additions & 9 deletions start_crawl.py
@@ -5,9 +5,7 @@
 from time import strftime
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
-from dotenv import load_dotenv
 
-from mail import sendMail
 from webSpider.spiders.BATCM import BATCM
 from webSpider.spiders.NATCM import NATCM
 from webSpider.spiders.HCOHP import HCOHP
@@ -54,13 +52,6 @@ def job():
 
     process.start()
 
-    load_dotenv()
-
-    if ("SENDER" in os.environ) and ("RECEIVERS" in os.environ):
-        SENDER = os.environ["SENDER"]
-        RECEIVERS = os.environ.get["RECEIVERS"]
-        sendMail(sender=SENDER, receivers=RECEIVERS)
-
 
 if crawl_auto == "non":
     job()
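
With the mail step gone, start_crawl.py is reduced to scheduling and running the spiders. The following is a minimal sketch of how job() presumably looks after this commit, assembled from the imports and calls visible in the diff rather than copied from the repository:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from webSpider.spiders.BATCM import BATCM
    from webSpider.spiders.NATCM import NATCM
    from webSpider.spiders.HCOHP import HCOHP


    def job():
        process = CrawlerProcess(get_project_settings())

        # crawl() only schedules each spider; start() runs them all and
        # blocks until every crawl has finished.
        process.crawl(BATCM)
        process.crawl(NATCM)
        process.crawl(HCOHP)

        process.start()
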
31 changes: 19 additions & 12 deletions webSpider/pipelines.py
@@ -23,12 +23,11 @@ def connect_elasticsearch(self):
         load_dotenv()
         logging.debug("print config value: %s", os.environ)
 
-        USERNAME = os.environ.get("USERNAME", "changeme")
-        PASSWORD = os.environ.get("PASSWORD", "changeme")
-        URL = os.environ.get("URL", "http://localhost:9200")
+        USERNAME = os.environ.get("USERNAME", False)
+        PASSWORD = os.environ.get("PASSWORD", False)
+        URL = os.environ.get("URL", False)
 
-        if USERNAME == "changeme" and PASSWORD == "changeme":
-            # If anyone actually uses a username and password like these, better not to connect anyway, lol
+        if not (USERNAME and PASSWORD and URL):
             self.es_connected = False
         else:
             # See the official docs for details: https://elasticsearch-py.readthedocs.io/en/7.x/
@@ -40,7 +39,7 @@ def connect_elasticsearch(self):
 
             INDEX = os.environ.get("ES_INDEX", "changeme")
 
-            self.es.search(index="policy", filter_path=["hits.hits._id"])
+            self.es.search(index=INDEX, filter_path=["hits.total.value"])
         except Exception:
             logging.error("Fail to connect ElasticSearch.")
             self.es_connected = False
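
Taken together, the two hunks above change connect_elasticsearch from silently falling back to "changeme" defaults to refusing to connect unless USERNAME, PASSWORD and URL are all set, and to probing the configured ES_INDEX instead of a hard-coded "policy" index. A self-contained sketch of that guard follows; the helper name and the Elasticsearch constructor call are assumptions, since the diff does not show how the client is built:

    import os

    from dotenv import load_dotenv
    from elasticsearch import Elasticsearch  # elasticsearch-py 7.x, per the docs link in the diff


    def make_client():
        load_dotenv()  # reads USERNAME / PASSWORD / URL / ES_INDEX from a local .env file

        username = os.environ.get("USERNAME", False)
        password = os.environ.get("PASSWORD", False)
        url = os.environ.get("URL", False)

        if not (username and password and url):
            # Missing configuration: the pipeline marks itself offline
            # (self.es_connected = False) instead of guessing defaults.
            return None

        # http_auth is the standard way to pass basic-auth credentials in 7.x;
        # the repository's actual constructor arguments are not visible here.
        return Elasticsearch(url, http_auth=(username, password))
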
@@ -49,15 +48,15 @@ def open_spider(self, spider):
         self.connect_elasticsearch()
 
     def close_spider(self, spider):
-        pass
+        self.es.close()
 
     def process_item(self, item, spider):
 
         if self.es_connected:
 
             logging.debug("Processing items in pipelines: {}".format(item))
 
-            index = "policy"
+            index = os.environ["ES_INDEX"]
 
             logging.debug("publishingDate: " + item["publishingDate"])
 
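
One detail about the close_spider change: self.es is only created when the connection attempt succeeds, so a defensive variant (an assumption, not part of this commit) would close the client conditionally:

    def close_spider(self, spider):
        # self.es never exists when connect_elasticsearch bailed out early,
        # so only close an actually-established connection.
        if getattr(self, "es_connected", False):
            self.es.close()
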
@@ -80,11 +79,19 @@ def process_item(self, item, spider):
             insert_body = ItemAdapter(item).asdict()
             insert_body["@timestamp"] = strftime("%Y-%m-%dT%H:%M:%S%z")
 
-            if self.es.count(index=index, body=search_body)["count"] == 0:
+            result = self.es.search(
+                index=index,
+                body=search_body,
+                filter_path=["hits.hits._id", "hits.total.value"],
+            )
+
+            id = ""
+            count = result["hits"]["total"]["value"]
+
+            if count == 0:
                 self.es.create(index=index, body=insert_body, id=uuid.uuid1())
             else:
-                self.es.update_by_query(
-                    index=index, body=insert_body, conflicts="proceed"
-                )
+                id = result["hits"]["hits"][0]["_id"]
+                self.es.update(index=index, id=id, body={"doc": insert_body})
 
         return item
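
This last hunk appears to be the actual "scrapy update bug" fix: the old code decided between insert and update with a bare count() and then passed the item body to update_by_query, which expects a query (and optionally a script) rather than a replacement document, so existing records were likely never updated with new content. The new code searches for the matching document and, when one exists, updates it in place by _id. A hedged, self-contained sketch of that create-or-update pattern (the helper name and parameters are illustrative, not from the repository):

    import uuid

    from elasticsearch import Elasticsearch


    def upsert_item(es: Elasticsearch, index: str, search_body: dict, insert_body: dict) -> None:
        result = es.search(
            index=index,
            body=search_body,
            # filter_path trims the response to the two fields used below.
            filter_path=["hits.hits._id", "hits.total.value"],
        )

        if result["hits"]["total"]["value"] == 0:
            # No match: create a fresh document under a generated id.
            es.create(index=index, body=insert_body, id=uuid.uuid1())
        else:
            # Match found: partial-update that document via the "doc" wrapper.
            doc_id = result["hits"]["hits"][0]["_id"]
            es.update(index=index, id=doc_id, body={"doc": insert_body})
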
12 changes: 0 additions & 12 deletions webSpider/settings.py
@@ -57,20 +57,8 @@
 DOWNLOADER_MIDDLEWARES = {
     "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
     "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
-    "scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 400,
-    "scrapy_fake_useragent.middleware.RetryUserAgentMiddleware": 401,
 }
 
-# https://github.com/alecxe/scrapy-fake-useragent
-FAKEUSERAGENT_PROVIDERS = [
-    # this is the first provider we'll try
-    "scrapy_fake_useragent.providers.FakeUserAgentProvider",
-    # if FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us
-    "scrapy_fake_useragent.providers.FakerProvider",
-    # fall back to USER_AGENT value
-    "scrapy_fake_useragent.providers.FixedUserAgentProvider",
-]
-
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
 # EXTENSIONS = {
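
Dropping scrapy_fake_useragent means no middleware rotates the User-Agent header any more, and the stock UserAgentMiddleware and RetryMiddleware remain disabled by the None entries. If a fixed User-Agent or retries were still wanted later, a hedged sketch of the relevant settings.py lines (the values and the decision to re-enable anything are assumptions, not part of this commit) could look like:

    # settings.py -- hypothetical follow-up, not part of this commit
    USER_AGENT = "webSpider (+https://example.org)"  # placeholder; applied by the stock UserAgentMiddleware

    DOWNLOADER_MIDDLEWARES = {
        # Keep the fake-useragent middlewares out, but restore Scrapy's
        # built-in retry middleware at its default order.
        "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
    }
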
