From 5367deb66879da5f084071018d4aa76b84023d8b Mon Sep 17 00:00:00 2001 From: belthlemar Date: Tue, 9 Jul 2024 18:02:26 +0200 Subject: [PATCH] add arrow support in put raw --- antarest/study/service.py | 14 ++- antarest/study/web/raw_studies_blueprint.py | 2 +- requirements-dev.txt | 1 + .../test_fetch_raw_data.py | 95 ++++--------------- 4 files changed, 34 insertions(+), 78 deletions(-) diff --git a/antarest/study/service.py b/antarest/study/service.py index f181d874eb..055d8df18f 100644 --- a/antarest/study/service.py +++ b/antarest/study/service.py @@ -14,6 +14,8 @@ import numpy as np import pandas as pd +import pyarrow as pa +import pyarrow.feather as feather from fastapi import HTTPException, UploadFile from markupsafe import escape from starlette.responses import FileResponse, Response @@ -1440,9 +1442,15 @@ def _create_edit_study_command( ) elif isinstance(tree_node, InputSeriesMatrix): if isinstance(data, bytes): - # noinspection PyTypeChecker - matrix = np.loadtxt(io.BytesIO(data), delimiter="\t", dtype=np.float64, ndmin=2) - matrix = matrix.reshape((1, 0)) if matrix.size == 0 else matrix + # checks if it corresponds to arrow format or if it's a classic file. + if data[:5].decode("utf-8") == "ARROW": + buffer = pa.BufferReader(data) # type: ignore + table = feather.read_table(buffer) + df = table.to_pandas() + matrix = df.to_numpy() + else: + matrix = np.loadtxt(io.BytesIO(data), delimiter="\t", dtype=np.float64, ndmin=2) + matrix = matrix.reshape((1, 0)) if matrix.size == 0 else matrix return ReplaceMatrix( target=url, matrix=matrix.tolist(), diff --git a/antarest/study/web/raw_studies_blueprint.py b/antarest/study/web/raw_studies_blueprint.py index a76b5a07e0..cb05a30b39 100644 --- a/antarest/study/web/raw_studies_blueprint.py +++ b/antarest/study/web/raw_studies_blueprint.py @@ -385,7 +385,7 @@ def replace_study_file( Parameters: - `uuid`: The UUID of the study. - `path`: The path to the data to update. Defaults to "/". - - `file`: The raw file to be posted (e.g. a CSV file opened in binary mode). + - `file`: The raw file to be posted (e.g. a CSV file opened in binary mode or a matrix in arrow format). - `create_missing`: Flag to indicate whether to create file or parent directories if missing. """ logger.info( diff --git a/requirements-dev.txt b/requirements-dev.txt index e7ff79c736..808596e882 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,6 +11,7 @@ pyinstaller-hooks-contrib==2024.6 # of the corresponding implementation libraries used in production (in `requirements.txt`). pandas-stubs~=1.4.0 +pyarrow-stubs~=10.0.1.7 types-psycopg2~=2.9.4 types-redis~=4.1.2 types-requests~=2.27.1 diff --git a/tests/integration/raw_studies_blueprint/test_fetch_raw_data.py b/tests/integration/raw_studies_blueprint/test_fetch_raw_data.py index f19d426d30..6cf82a8239 100644 --- a/tests/integration/raw_studies_blueprint/test_fetch_raw_data.py +++ b/tests/integration/raw_studies_blueprint/test_fetch_raw_data.py @@ -46,7 +46,7 @@ def test_get_study( with db(): study: RawStudy = db.session.get(Study, internal_study_id) study_dir = pathlib.Path(study.path) - headers = {"Authorization": f"Bearer {user_access_token}"} + client.headers = {"Authorization": f"Bearer {user_access_token}"} shutil.copytree( ASSETS_DIR.joinpath("user"), @@ -58,11 +58,7 @@ def test_get_study( user_folder_dir = study_dir.joinpath("user/folder") for file_path in user_folder_dir.glob("*.*"): rel_path = file_path.relative_to(study_dir).as_posix() - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": rel_path, "depth": 1}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": rel_path, "depth": 1}) assert res.status_code == 200, res.json() if file_path.suffix == ".json": # special case for JSON files @@ -85,9 +81,7 @@ def test_get_study( for file_path in user_folder_dir.glob("*.*"): rel_path = file_path.relative_to(study_dir) res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": f"/{rel_path.as_posix()}", "depth": 1}, - headers=headers, + f"/v1/studies/{internal_study_id}/raw", params={"path": f"/{rel_path.as_posix()}", "depth": 1} ) assert res.status_code == 200, res.json() actual = res.content @@ -95,11 +89,7 @@ def test_get_study( assert actual == expected # If you try to retrieve a file that doesn't exist, we should have a 404 error - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": "user/somewhere/something.txt"}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": "user/somewhere/something.txt"}) assert res.status_code == 404, res.json() assert res.json() == { "description": "'somewhere' not a child of User", @@ -111,7 +101,6 @@ def test_get_study( res = client.put( f"/v1/studies/{internal_study_id}/raw", params={"path": "user/somewhere/something.txt"}, - headers=headers, files={"file": io.BytesIO(b"Goodbye World!")}, ) assert res.status_code == 404, res.json() @@ -125,7 +114,6 @@ def test_get_study( res = client.put( f"/v1/studies/{internal_study_id}/raw", params={"path": "user/somewhere/something.txt", "create_missing": True}, - headers=headers, files={"file": io.BytesIO(b"Goodbye Cruel World!")}, ) assert res.status_code == 204, res.json() @@ -135,27 +123,18 @@ def test_get_study( res = client.put( f"/v1/studies/{internal_study_id}/raw", params={"path": "user/somewhere/something.txt", "create_missing": True}, - headers=headers, files={"file": io.BytesIO(b"This is the end!")}, ) assert res.status_code == 204, res.json() # You can check that the resource has been created or updated. - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": "user/somewhere/something.txt"}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": "user/somewhere/something.txt"}) assert res.status_code == 200, res.json() assert res.content == b"This is the end!" # If we ask for properties, we should have a JSON content rel_path = "/input/links/de/properties/fr" - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": rel_path, "depth": 2}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": rel_path, "depth": 2}) assert res.status_code == 200, res.json() actual = res.json() assert actual == { @@ -177,32 +156,20 @@ def test_get_study( # If we ask for a matrix, we should have a JSON content if formatted is True rel_path = "/input/links/de/fr" expected_row = [100000, 100000, 0.01, 0.01, 0, 0, 0, 0] - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": rel_path, "formatted": True}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": rel_path, "formatted": True}) assert res.status_code == 200, res.json() old_result = res.json() assert old_result == {"index": ANY, "columns": ANY, "data": ANY} assert old_result["data"][0] == expected_row # We should have the same result with new flag 'format' set to 'JSON' - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": rel_path, "format": "json"}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": rel_path, "format": "json"}) assert res.status_code == 200, res.json() new_result = res.json() assert new_result == old_result # If we ask for a matrix, we should have a CSV content if formatted is False - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": rel_path, "formatted": False}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": rel_path, "formatted": False}) assert res.status_code == 200, res.json() old_result = res.text actual_lines = old_result.splitlines() @@ -210,36 +177,25 @@ def test_get_study( assert first_row == expected_row # We should have the same result with new flag 'format' set to 'bytes' - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": rel_path, "format": "bytes"}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": rel_path, "format": "bytes"}) assert res.status_code == 200, res.json() new_result = res.text assert new_result == old_result # If we ask for a matrix, we should have arrow binary if format = "arrow" - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": rel_path, "format": "arrow"}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": rel_path, "format": "arrow"}) assert res.status_code == 200 assert isinstance(res.content, bytes) assert res.text.startswith("ARROW") - buffer = pa.BufferReader(res.content) + arrow_bytes = res.content + buffer = pa.BufferReader(arrow_bytes) table = feather.read_table(buffer) df = table.to_pandas() assert list(df.loc[0]) == expected_row # Asserts output matrix (containing index and columns) can be retrieved with arrow output_path = "/output/20201014-1422eco-hello/economy/mc-all/areas/de/id-daily" - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": output_path, "format": "arrow"}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": output_path, "format": "arrow"}) assert res.status_code == 200 assert isinstance(res.content, bytes) assert res.text.startswith("ARROW") @@ -248,11 +204,14 @@ def test_get_study( df = table.to_pandas() assert df.columns[0] == "Index" # asserts the first columns corresponds to the index in such a case. + # Try to replace a matrix with a one in arrow format + res = client.put(f"/v1/studies/{internal_study_id}/raw", params={"path": rel_path}, files={"file": arrow_bytes}) + assert res.status_code in {201, 204} + # If ask for an empty matrix, we should have an empty binary content res = client.get( f"/v1/studies/{internal_study_id}/raw", params={"path": "input/thermal/prepro/de/01_solar/data", "formatted": False}, - headers=headers, ) assert res.status_code == 200, res.json() assert res.content == b"" @@ -261,7 +220,6 @@ def test_get_study( res = client.get( f"/v1/studies/{internal_study_id}/raw", params={"path": "input/thermal/prepro/de/01_solar/data", "formatted": True}, - headers=headers, ) assert res.status_code == 200, res.json() assert res.json() == {"index": [0], "columns": [], "data": []} @@ -271,19 +229,13 @@ def test_get_study( for file_path in user_folder_dir.glob("*.*"): rel_path = file_path.relative_to(study_dir) res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": f"/{rel_path.as_posix()}", "depth": 1}, - headers=headers, + f"/v1/studies/{internal_study_id}/raw", params={"path": f"/{rel_path.as_posix()}", "depth": 1} ) assert res.status_code == http.HTTPStatus.UNPROCESSABLE_ENTITY # We can access to the configuration the classic way, # for instance, we can get the list of areas: - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": "/input/areas/list", "depth": 1}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": "/input/areas/list", "depth": 1}) assert res.status_code == 200, res.json() assert res.json() == ["DE", "ES", "FR", "IT"] @@ -291,16 +243,11 @@ def test_get_study( res = client.get( f"/v1/studies/{internal_study_id}/raw", params={"path": "output/20201014-1427eco/economy/mc-all/areas/de/id-monthly"}, - headers=headers, ) assert res.status_code == 200 assert np.isnan(res.json()["data"][0]).any() # Iterate over all possible combinations of path and depth for path, depth in itertools.product([None, "", "/"], [0, 1, 2]): - res = client.get( - f"/v1/studies/{internal_study_id}/raw", - params={"path": path, "depth": depth}, - headers=headers, - ) + res = client.get(f"/v1/studies/{internal_study_id}/raw", params={"path": path, "depth": depth}) assert res.status_code == 200, f"Error for path={path} and depth={depth}"