From 1b734d33c48447089045d922df0c89c24801786f Mon Sep 17 00:00:00 2001 From: Tales Mota Date: Thu, 9 May 2024 11:04:55 -0300 Subject: [PATCH] feat: create date filter for start_date and end_date --- .../gazette/spiders/pr/pr_guaratuba.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/data_collection/gazette/spiders/pr/pr_guaratuba.py b/data_collection/gazette/spiders/pr/pr_guaratuba.py index bdd0575f0..1fc9119a9 100644 --- a/data_collection/gazette/spiders/pr/pr_guaratuba.py +++ b/data_collection/gazette/spiders/pr/pr_guaratuba.py @@ -41,8 +41,12 @@ def _extract_label_edition_number(self, link_selector): def parse(self, response): map_date_edition = dict() for line in response.css(".table.table-striped tr"): - td = line.css("td") - if len(td) == 0 or td.get() and td.get().find("Data da Publicação") != -1: + td_selector = line.css("td") + if ( + len(td_selector) == 0 + or td_selector.get() + and td_selector.get().find("Data da Publicação") != -1 + ): continue else: _gazzete_date = dateparser.parse( @@ -51,13 +55,16 @@ def parse(self, response): gazzete_date = _gazzete_date.date() pdf_links = [] edition_number, is_extra_edition = None, False - for _a in line.css("td")[1].css("a"): - pdf_relative_link = _a.css("::attr(href)").get() + for link_selector in line.css("td")[1].css("a"): + pdf_relative_link = link_selector.css("::attr(href)").get() pdf_links.append(self.base_url + pdf_relative_link) - label = _a.css("::text").get().strip() + + label = link_selector.css("::text").get().strip() if not label: - label, _edition_number = self._extract_label_edition_number(_a) + label, _edition_number = self._extract_label_edition_number( + link_selector + ) else: _edition_number = self._parse_edition_number(label)