Skip to content
This repository has been archived by the owner on Mar 10, 2023. It is now read-only.

Commit

Permalink
Fix JSON double-quote problem in BATCM
Browse files — browse the repository at this point in the history
  • Loading branch information
gricn committed Aug 27, 2021
1 parent 80a506d commit bef7c70
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 4 deletions.
2 changes: 0 additions & 2 deletions webSpider/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,6 @@ def process_item(self, item, spider):
"query": {
"bool": {
"must": [
{"term": {"title.keyword": {"value": item["title"]}}},
{"match": {"publishingDate": item["publishingDate"]}},
{
"term": {
"urlSource.keyword": {"value": item["urlSource"]}
Expand Down
5 changes: 3 additions & 2 deletions webSpider/spiders/BATCM.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def detailPage(self, response):

# remove "\n" and spaces from the title
title_new = title_origin.strip(" \n")
item["title"] = re.sub(r"\s", "", title_new)
item["title"] = re.sub(r'"', "\u0022", title_new)

date_origin = response.css("div.zhengwen div::text").get()

Expand All @@ -156,9 +156,10 @@ def detailPage(self, response):
if ul != []:
for li in ul:
mark = li.css("a::text").get()
new_mark = re.sub(r'"', "\u0022", mark)

link = response.urljoin(li.css("a::attr(href)").get())
attachment.append({"mark": mark, "link": link})
attachment.append({"mark": new_mark, "link": link})
item["attachment"] = attachment

yield item

0 comments on commit bef7c70

Please sign in to comment.