Merge pull request #68 from comsysto/fix/FRGOV-pipeline-not-working

Fix FR governmental data pipeline
comsysto · Feb 2, 2024 · 1581c53 · 1581c53
2 parents 6372095 + 51b0c2d
commit 1581c53
Show file tree

Hide file tree

Showing 7 changed files with 59 additions and 39 deletions.
diff --git a/.github/workflows/check-datasources.yml b/.github/workflows/check-datasources.yml
@@ -24,13 +24,12 @@ jobs:
  run: |
  python -m pip install --upgrade pip
  pip install -r requirements.txt
+ pip install -r test/requirements.txt
 
- # - name: Run integration tests (only)
- # run: |
- # pip install -r test/requirements.txt
- # pytest -m "integration_test"
-
- - name: "[DE/BNA] Real data validity checks"
+ - name: "[DE/BNA] Data retrieval check"
  run: |
- pip install -r test/requirements.txt
  pytest tests/integration/test_int_de_bna.py
+
+ - name: "[FR] Data retrieval check"
+ run: |
+ pytest tests/integration/test_int_fr_france.py
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -45,4 +45,4 @@ jobs:
  - name: Run tests
  run: |
  pip install -r test/requirements.txt
- pytest
+ pytest -m "not check_datasource"
diff --git a/charging_stations_pipelines/pipelines/fr/france.py b/charging_stations_pipelines/pipelines/fr/france.py
@@ -31,12 +31,7 @@ def _retrieve_data(self):
  if self.online:
  logger.info("Retrieving Online Data")
  self.download_france_gov_file(tmp_data_path)
- self.data = pd.read_csv(
- os.path.join(data_dir, "france_stations.csv"),
- delimiter=",",
- encoding="utf-8",
- encoding_errors="replace",
- )
+ self.data = self.load_csv_file(tmp_data_path)
 
  def run(self):
  logger.info("Running FR GOV Pipeline...")
@@ -55,7 +50,7 @@ def run(self):
  @staticmethod
  def download_france_gov_file(target_file):
  """Download a file from the French government website."""
- base_url = "https://transport.data.gouv.fr/resources/79624"
+ base_url = "https://transport.data.gouv.fr/resources/81548"
 
  r = requests.get(base_url, headers={"User-Agent": "Mozilla/5.0"})
  soup = BeautifulSoup(r.content, "html.parser")
@@ -73,3 +68,13 @@ def download_france_gov_file(target_file):
  "Could not determine source for french government data",
  )
  download_file(link_to_dataset[0]["href"], target_file)
+
+ @staticmethod
+ def load_csv_file(target_file):
+ return pd.read_csv(
+ target_file,
+ delimiter=",",
+ encoding="utf-8",
+ encoding_errors="replace",
+ low_memory=False,
+ )
diff --git a/charging_stations_pipelines/pipelines/fr/france_mapper.py b/charging_stations_pipelines/pipelines/fr/france_mapper.py
@@ -41,15 +41,11 @@ def map_station_fra(row: pd.Series) -> Station:
  float(check_coordinates(row.get("consolidated_latitude"))),
  )
  )
- station.date_created = row.get("date_mise_en_service").strptime("%Y-%m-%d")
- station.date_updated = row.get("date_maj").strptime("%Y-%m-%d")
 
  if not pd.isna(row.get("date_mise_en_service")):
  station.date_created = datetime.strptime(row.get("date_mise_en_service"), "%Y-%m-%d")
  if not pd.isna(row.get("date_maj")):
  station.date_updated = datetime.strptime(row.get("date_maj"), "%Y-%m-%d")
- else:
- station.date_updated = datetime.now
 
  return station
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -27,5 +27,6 @@ testpaths = [
 
 # Declare custom markers
 markers = [
- "integration_test: marks tests as integration tests, which are bit slow (deselect with '-m \"integration_test\"')",
+ "integration_test: marks tests as integration tests (deselect with '-m \"not integration_test\"')",
+ "check_datasource: marks tests as datasource check for scheduled github action (deselect with '-m \"not check_datasource\"')",
 ]
diff --git a/tests/integration/test_int_de_bna.py b/tests/integration/test_int_de_bna.py
@@ -58,25 +58,16 @@ def bna_data():
 
 
 @pytest.mark.integration_test
+@pytest.mark.check_datasource
 def test_file_size(bna_data):
  bna_file_name, _ = bna_data
  # Check file size of the downloaded file
- assert os.path.getsize(bna_file_name) >= 8_602_458 # ~ 9 MB
+ assert os.path.getsize(bna_file_name) >= 1_000 # actual file is ~ 9 MB; this only checks that the download is not nearly empty
 
 
 @pytest.mark.integration_test
+@pytest.mark.check_datasource
 def test_dataframe_schema(bna_data):
  _, bna_in_data = bna_data
  # Check schema of the downloaded Excel file
  assert verify_schema_follows(bna_in_data, EXPECTED_DATA_SCHEMA), "Mismatch in schema of the downloaded Excel file!"
-
-
-@pytest.mark.integration_test
-def test_dataframe_shape(bna_data):
- _, bna_in_data = bna_data
- # Check shape of the dataframe
- # Not exact check, because file grows over time
- # Expected: at least 54,223 rows and 23 columns
- num_rows, num_cols = bna_in_data.shape
- assert num_rows >= 54_223, "Mismatch in dataframe shape: too few rows!"
- assert num_cols >= 23, "Mismatch in dataframe shape: too few columns!"
diff --git a/tests/integration/test_int_fr_france.py b/tests/integration/test_int_fr_france.py
@@ -2,17 +2,45 @@
 
 import os
 import tempfile
-
 import pytest
 
 from charging_stations_pipelines.pipelines.fr.france import FraPipeline
-from test.shared import skip_if_github
+from tests.test_utils import verify_schema_follows
 
+EXPECTED_DATA_SCHEMA = {
+ "id_station_itinerance": "object",
+ "nom_operateur": "object",
+ "consolidated_longitude": "float64",
+ "consolidated_latitude": "float64",
+ "date_mise_en_service": "object",
+ "date_maj": "object",
+ "nbre_pdc": "int64",
+ "adresse_station": "object",
+ "consolidated_commune": "object",
+ "consolidated_code_postal": "float64",
+}
 
-@pytest.mark.integration_test
-@pytest.mark.skipif(skip_if_github(), reason="Skip the test when running on Github")
-def test_download_france_gov_file():
- """Test the download function."""
+
+@pytest.fixture(scope="module")
+def fr_data():
+ """Setup fixture for tests. Executes once per test module (and not before each test)."""
+ # Download real FR GOV data to a temporary file
  with tempfile.NamedTemporaryFile() as temp_file:
  FraPipeline.download_france_gov_file(temp_file.name)
- assert os.path.getsize(temp_file.name) >= 47_498_370 # ~ 50 MB
+ fr_dataframe = FraPipeline.load_csv_file(temp_file.name)
+ yield temp_file.name, fr_dataframe
+
+
+@pytest.mark.integration_test
+@pytest.mark.check_datasource
+def test_download_france_gov_file(fr_data):
+ """Test the download function."""
+ fr_filename, _ = fr_data
+ assert os.path.getsize(fr_filename) >= 1_000 # actual file is ~ 45 MB; this only checks that the download is not nearly empty
+
+
+@pytest.mark.integration_test
+@pytest.mark.check_datasource
+def test_dataframe_schema(fr_data):
+ _, fr_dataframe = fr_data
+ assert verify_schema_follows(fr_dataframe, EXPECTED_DATA_SCHEMA), "Mismatch in schema of the downloaded csv file!"