diff --git a/data_collection/gazette/spiders/base/instar.py b/data_collection/gazette/spiders/base/instar.py index 07b4a42407..5ce0ba2abf 100644 --- a/data_collection/gazette/spiders/base/instar.py +++ b/data_collection/gazette/spiders/base/instar.py @@ -8,6 +8,8 @@ class BaseInstarSpider(BaseGazetteSpider): + power = "executive_legislative" + def start_requests(self): page = 1 start_date = self.start_date.strftime("%d-%m-%Y") @@ -58,7 +60,7 @@ def parse(self, response, page, start_date, end_date): date=gazette_date, edition_number=edition_number, is_extra_edition=False, - power="executive_legislative", + power=self.power, ) yield scrapy.Request( diff --git a/data_collection/gazette/spiders/base/ptio.py b/data_collection/gazette/spiders/base/ptio.py index ee0e77d632..db74ea0f48 100644 --- a/data_collection/gazette/spiders/base/ptio.py +++ b/data_collection/gazette/spiders/base/ptio.py @@ -7,9 +7,9 @@ class BasePtioSpider(BaseGazetteSpider): def start_requests(self): - yield scrapy.Request(self.BASE_URL) + yield scrapy.Request(url=self.base_url, callback=self.ptio_parse) - def parse(self, response): + def ptio_parse(self, response): for gazette_div in response.xpath("//div[@class='edicoes']"): raw_gazete_date = gazette_div.xpath( ".//div[@class='data-caderno hidden-phone']/text()" @@ -41,4 +41,6 @@ def parse(self, response): "//ul[@class='paginacao']//a[@class='proximo']/@href" ) if next_page: - yield scrapy.Request(response.urljoin(next_page.get())) + yield scrapy.Request( + response.urljoin(next_page.get()), callback=self.ptio_parse + ) diff --git a/data_collection/gazette/spiders/rj/rj_areal.py b/data_collection/gazette/spiders/rj/rj_areal.py index 831fa16643..086d47713c 100644 --- a/data_collection/gazette/spiders/rj/rj_areal.py +++ b/data_collection/gazette/spiders/rj/rj_areal.py @@ -7,5 +7,5 @@ class RjArealSpider(BasePtioSpider): name = "rj_areal" TERRITORY_ID = "3300225" allowed_domains = ["portaldatransparencia.com.br"] - BASE_URL = "http://rj.portaldatransparencia.com.br/prefeitura/areal/" + base_url = "http://rj.portaldatransparencia.com.br/prefeitura/areal/" start_date = date(2006, 8, 1) diff --git a/data_collection/gazette/spiders/rj/rj_cabo_frio.py b/data_collection/gazette/spiders/rj/rj_cabo_frio.py index 4dd4a3888c..4f6c3438f0 100644 --- a/data_collection/gazette/spiders/rj/rj_cabo_frio.py +++ b/data_collection/gazette/spiders/rj/rj_cabo_frio.py @@ -1,11 +1,36 @@ -from datetime import date +from copy import copy +from datetime import date, timedelta +from gazette.spiders.base.instar import BaseInstarSpider from gazette.spiders.base.ptio import BasePtioSpider -class RjCaboFrioSpider(BasePtioSpider): +class RjCaboFrioSpider(BaseInstarSpider, BasePtioSpider): name = "rj_cabo_frio" TERRITORY_ID = "3300704" - allowed_domains = ["portaldatransparencia.com.br"] - BASE_URL = "http://rj.portaldatransparencia.com.br/prefeitura/cabofrio/" + allowed_domains = [ + "portaldatransparencia.com.br", + "cabofrio.instartecnologia.com.br", + ] + base_url = "https://www.cabofrio.instartecnologia.com.br/portal/diario-oficial" start_date = date(2020, 7, 29) + power = "executive" + + def start_requests(self): + ptio_url = "http://rj.portaldatransparencia.com.br/prefeitura/cabofrio/" + ptio_end_date = date(2024, 8, 31) + if self.start_date > ptio_end_date: + yield from BaseInstarSpider.start_requests(self) + else: + if self.end_date <= ptio_end_date: + self.base_url = ptio_url + yield from BasePtioSpider.start_requests(self) + else: + ptio = copy(self) + ptio.end_date = ptio_end_date + ptio.base_url = ptio_url + yield from BasePtioSpider.start_requests(ptio) + + instar = copy(self) + instar.start_date = ptio_end_date + timedelta(days=1) + yield from BaseInstarSpider.start_requests(instar) diff --git a/data_collection/gazette/spiders/rj/rj_comendador_levy_gasparian.py b/data_collection/gazette/spiders/rj/rj_comendador_levy_gasparian.py index b3cf2fddc5..9f4e4414ec 100644 --- a/data_collection/gazette/spiders/rj/rj_comendador_levy_gasparian.py +++ b/data_collection/gazette/spiders/rj/rj_comendador_levy_gasparian.py @@ -7,7 +7,7 @@ class RjComendadorLevyGasparianSpider(BasePtioSpider): name = "rj_comendador_levy_gasparian" TERRITORY_ID = "3300951" allowed_domains = ["portaldatransparencia.com.br"] - BASE_URL = ( + base_url = ( "http://rj.portaldatransparencia.com.br/prefeitura/comendadorlevygasparian/" ) start_date = date(2013, 11, 26) diff --git a/data_collection/gazette/spiders/rj/rj_sapucaia.py b/data_collection/gazette/spiders/rj/rj_sapucaia.py index 9959c28fce..04873583da 100644 --- a/data_collection/gazette/spiders/rj/rj_sapucaia.py +++ b/data_collection/gazette/spiders/rj/rj_sapucaia.py @@ -7,5 +7,5 @@ class RjSapucaiaSpider(BasePtioSpider): name = "rj_sapucaia" TERRITORY_ID = "3305406" allowed_domains = ["portaldatransparencia.com.br"] - BASE_URL = "http://rj.portaldatransparencia.com.br/prefeitura/sapucaia/" + base_url = "http://rj.portaldatransparencia.com.br/prefeitura/sapucaia/" start_date = date(2019, 1, 16)