feat: url agora construída usando a urllib.parse

okfn-brasil · Oct 1, 2024 · f371afa · f371afa
1 parent 19b0fb2
commit f371afa
Showing 1 changed file with 3 additions and 11 deletions.
diff --git a/data_collection/gazette/spiders/ma/ma_duque_bacelar.py b/data_collection/gazette/spiders/ma/ma_duque_bacelar.py
@@ -1,8 +1,8 @@
+import urllib.parse
 from datetime import date
 from typing import List
 
 import scrapy
-import scrapy.http
 
 from gazette.items import Gazette
 from gazette.spiders.base import BaseGazetteSpider
@@ -15,21 +15,13 @@ class MaDuqueBacelar(BaseGazetteSpider):
     start_urls = ["https://duquebacelar.ma.gov.br/transparencia/diario-oficial"]
     start_date = date(2019, 8, 29)
 
-    def start_requests(self):
-        for url in self.start_urls:
-            yield scrapy.Request(url=url, callback=self.parse)
-
     def _extract_url(self, url_element: scrapy.Selector):
         (raw_path,) = url_element.css("a")
 
-        path = (
-            raw_path.attrib.get("href")[1:]
-            if raw_path.attrib.get("href")[0] == "/"
-            else raw_path
+        return urllib.parse.urljoin(
+            "https://duquebacelar.ma.gov.br", raw_path.attrib.get("href")
         )
 
-        return f"https://duquebacelar.ma.gov.br/{path}"
-
     def _extract_date(self, date_element: scrapy.Selector):
         day, month, year = date_element.xpath("text()").get().split("/")