Skip to content

Commit

Permalink
Merge pull request #68 from comsysto/fix/FRGOV-pipeline-not-working
Browse files Browse the repository at this point in the history
Fix FR governmental data pipeline
  • Loading branch information
mjmader authored Feb 2, 2024
2 parents 6372095 + 51b0c2d commit 1581c53
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 39 deletions.
13 changes: 6 additions & 7 deletions .github/workflows/check-datasources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,12 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r test/requirements.txt
# - name: Run integration tests (only)
# run: |
# pip install -r test/requirements.txt
# pytest -m "integration_test"

- name: "[DE/BNA] Real data validity checks"
- name: "[DE/BNA] Data retrieval check"
run: |
pip install -r test/requirements.txt
pytest tests/integration/test_int_de_bna.py
- name: "[FR] Data retrieval check"
run: |
pytest tests/integration/test_int_fr_france.py
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ jobs:
- name: Run tests
run: |
pip install -r test/requirements.txt
pytest
pytest -m "not check_datasource"
19 changes: 12 additions & 7 deletions charging_stations_pipelines/pipelines/fr/france.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,7 @@ def _retrieve_data(self):
if self.online:
logger.info("Retrieving Online Data")
self.download_france_gov_file(tmp_data_path)
self.data = pd.read_csv(
os.path.join(data_dir, "france_stations.csv"),
delimiter=",",
encoding="utf-8",
encoding_errors="replace",
)
self.data = self.load_csv_file(tmp_data_path)

def run(self):
logger.info("Running FR GOV Pipeline...")
Expand All @@ -55,7 +50,7 @@ def run(self):
@staticmethod
def download_france_gov_file(target_file):
"""Download a file from the French government website."""
base_url = "https://transport.data.gouv.fr/resources/79624"
base_url = "https://transport.data.gouv.fr/resources/81548"

r = requests.get(base_url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(r.content, "html.parser")
Expand All @@ -73,3 +68,13 @@ def download_france_gov_file(target_file):
"Could not determine source for french government data",
)
download_file(link_to_dataset[0]["href"], target_file)

@staticmethod
def load_csv_file(target_file):
return pd.read_csv(
target_file,
delimiter=",",
encoding="utf-8",
encoding_errors="replace",
low_memory=False,
)
4 changes: 0 additions & 4 deletions charging_stations_pipelines/pipelines/fr/france_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,11 @@ def map_station_fra(row: pd.Series) -> Station:
float(check_coordinates(row.get("consolidated_latitude"))),
)
)
station.date_created = row.get("date_mise_en_service").strptime("%Y-%m-%d")
station.date_updated = row.get("date_maj").strptime("%Y-%m-%d")

if not pd.isna(row.get("date_mise_en_service")):
station.date_created = datetime.strptime(row.get("date_mise_en_service"), "%Y-%m-%d")
if not pd.isna(row.get("date_maj")):
station.date_updated = datetime.strptime(row.get("date_maj"), "%Y-%m-%d")
else:
station.date_updated = datetime.now

return station

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ testpaths = [

# Declare custom markers
markers = [
"integration_test: marks tests as integration tests, which are bit slow (deselect with '-m \"integration_test\"')",
"integration_test: marks tests as integration tests (deselect with '-m \"not integration_test\"')",
"check_datasource: marks tests as datasource check for scheduled github action (deselect with '-m \"not check_datasource\"')",
]
15 changes: 3 additions & 12 deletions tests/integration/test_int_de_bna.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,25 +58,16 @@ def bna_data():


@pytest.mark.integration_test
@pytest.mark.check_datasource
def test_file_size(bna_data):
bna_file_name, _ = bna_data
# Check file size of the downloaded file
assert os.path.getsize(bna_file_name) >= 8_602_458 # ~ 9 MB
assert os.path.getsize(bna_file_name) >= 1_000 # actual file is ~ 9 MB, just make sure it is not quasi empty here


@pytest.mark.integration_test
@pytest.mark.check_datasource
def test_dataframe_schema(bna_data):
    """Verify that the downloaded Excel file still follows the expected schema."""
    _, downloaded_frame = bna_data
    schema_matches = verify_schema_follows(downloaded_frame, EXPECTED_DATA_SCHEMA)
    assert schema_matches, "Mismatch in schema of the downloaded Excel file!"


@pytest.mark.integration_test
def test_dataframe_shape(bna_data):
_, bna_in_data = bna_data
# Check shape of the dataframe
# Not exact check, because file grows over time
# Expected: at least 54,223 rows and 23 columns
num_rows, num_cols = bna_in_data.shape
assert num_rows >= 54_223, "Mismatch in dataframe shape: too few rows!"
assert num_cols >= 23, "Mismatch in dataframe shape: too few columns!"
42 changes: 35 additions & 7 deletions tests/integration/test_int_fr_france.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,45 @@

import os
import tempfile

import pytest

from charging_stations_pipelines.pipelines.fr.france import FraPipeline
from test.shared import skip_if_github
from tests.test_utils import verify_schema_follows

# Schema that the downloaded FR GOV csv file is expected to follow; it is passed
# to verify_schema_follows() together with the loaded dataframe in test_dataframe_schema.
# NOTE(review): keys look like csv column names and values like pandas dtype names
# as inferred by FraPipeline.load_csv_file — confirm against verify_schema_follows.
EXPECTED_DATA_SCHEMA = {
    "id_station_itinerance": "object",
    "nom_operateur": "object",
    "consolidated_longitude": "float64",
    "consolidated_latitude": "float64",
    "date_mise_en_service": "object",
    "date_maj": "object",
    "nbre_pdc": "int64",
    "adresse_station": "object",
    "consolidated_commune": "object",
    "consolidated_code_postal": "float64",
}

@pytest.mark.integration_test
@pytest.mark.skipif(skip_if_github(), reason="Skip the test when running on Github")
def test_download_france_gov_file():
"""Test the download function."""

@pytest.fixture(scope="module")
def fr_data():
    """Module-scoped fixture providing the real FR GOV data.

    Runs once per test module (not before each test) and yields a tuple of the
    temporary file name and the dataframe loaded from that file.
    """
    # Download real FR GOV data to a temporary file
    with tempfile.NamedTemporaryFile() as temp_file:
        FraPipeline.download_france_gov_file(temp_file.name)
        assert os.path.getsize(temp_file.name) >= 47_498_370  # ~ 50 MB
        fr_dataframe = FraPipeline.load_csv_file(temp_file.name)
        # Yield inside the with-block so the temporary file is still open while the tests use it
        yield temp_file.name, fr_dataframe


@pytest.mark.integration_test
@pytest.mark.check_datasource
def test_download_france_gov_file(fr_data):
    """Check that downloading the FR GOV file produced a non-trivial file."""
    downloaded_path, _ = fr_data
    size_in_bytes = os.path.getsize(downloaded_path)
    # The real file is about 45 MB; the low threshold only guards against a nearly empty download.
    assert size_in_bytes >= 1_000


@pytest.mark.integration_test
@pytest.mark.check_datasource
def test_dataframe_schema(fr_data):
    """Verify that the downloaded csv file still follows the expected schema."""
    _, loaded_frame = fr_data
    schema_matches = verify_schema_follows(loaded_frame, EXPECTED_DATA_SCHEMA)
    assert schema_matches, "Mismatch in schema of the downloaded csv file!"

0 comments on commit 1581c53

Please sign in to comment.