Skip to content

Commit

Permalink
feat: url agora construída usando a urllib.parse
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanzigoni committed Oct 1, 2024
1 parent 19b0fb2 commit f371afa
Showing 1 changed file with 3 additions and 11 deletions.
14 changes: 3 additions & 11 deletions data_collection/gazette/spiders/ma/ma_duque_bacelar.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import urllib.parse
from datetime import date
from typing import List

import scrapy
import scrapy.http

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider
Expand All @@ -15,21 +15,13 @@ class MaDuqueBacelar(BaseGazetteSpider):
start_urls = ["https://duquebacelar.ma.gov.br/transparencia/diario-oficial"]
start_date = date(2019, 8, 29)

def start_requests(self):
    """Yield one ``scrapy.Request`` per entry in ``self.start_urls``.

    Every request is routed to ``self.parse`` as its callback.
    """
    yield from (
        scrapy.Request(url=start_url, callback=self.parse)
        for start_url in self.start_urls
    )

def _extract_url(self, url_element: scrapy.Selector):
    """Return the absolute URL of the single link inside ``url_element``.

    The link's ``href`` is resolved against the municipality's site root
    with ``urllib.parse.urljoin``, so root-relative, relative and already
    absolute hrefs are all handled without manual slash stripping.

    Args:
        url_element: selector expected to contain exactly one ``<a>``.

    Returns:
        The resolved URL as a string.

    Raises:
        ValueError: if ``url_element.css("a")`` does not yield exactly one
            element (the single-item unpacking below enforces this).
    """
    # Single-element unpacking: fail loudly if the markup changes shape.
    (anchor,) = url_element.css("a")

    # NOTE(review): a missing href makes urljoin return the bare base URL;
    # confirm whether that case can occur on the live page.
    return urllib.parse.urljoin(
        "https://duquebacelar.ma.gov.br", anchor.attrib.get("href")
    )

def _extract_date(self, date_element: scrapy.Selector):
day, month, year = date_element.xpath("text()").get().split("/")

Expand Down

0 comments on commit f371afa

Please sign in to comment.