From 80a506d4645fbb45e0b127b3bc2db03b6bf40eee Mon Sep 17 00:00:00 2001 From: gricn Date: Fri, 27 Aug 2021 18:43:58 +0800 Subject: [PATCH] fix NATCM's bug --- webSpider/spiders/NATCM.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/webSpider/spiders/NATCM.py b/webSpider/spiders/NATCM.py index bb10f4d..3af949b 100644 --- a/webSpider/spiders/NATCM.py +++ b/webSpider/spiders/NATCM.py @@ -125,11 +125,9 @@ def detailPage(self, response): # change "时间:2020-12-10 15:40:19" to "2020-12-10" item["publishingDate"] = re.search("(?<=:)\S*", date_origin).group(0) - item["source"] = response.xpath( - "//td[@valign]/table[2]//td/span/p[last()]/text()" - ).get() + item["source"] = "国家中医药管理局" - article = "".join(response.xpath("//td[@valign]/table[2]//td").getall()) + article = "".join(response.xpath("//td[@valign]/table[2]//td/span/p").getall()) item["article"] = article item["plaintext"] = re.sub(r"\s(\s)+", " ", remove_tags(article))