From 883609717e10651b011e39a2dfb83145c5ad834c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 10 Oct 2024 14:18:32 +0200 Subject: [PATCH] data: request a taxonomy tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of the data we pull from S3, we need another folder that contains the taxonomy tree. Signed-off-by: Sébastien Han --- standalone/README.md | 8 +++++--- standalone/standalone.py | 3 +-- standalone/standalone.tpl | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/standalone/README.md b/standalone/README.md index f7c114f..35f8e90 100644 --- a/standalone/README.md +++ b/standalone/README.md @@ -116,13 +116,15 @@ In this scenario the name of the bucket is `sdg-data` and the tarball file is `d ```bash ilab data generate mv generated data -tar -czvf data.tar.gz data model +tar -czvf data.tar.gz data model taxonomy aws cp data.tar.gz s3://sdg-data/data.tar.gz ``` > [!CAUTION] -> Ensures SDG data are in a directory called "data" and the model is in a directory called "model". -> The tarball must contain two top-level directories: `data` and `model`. +> Ensure SDG data are in a directory called "data". +> Ensure the model to train is in a directory called "model". +> Ensure that the taxonomy tree used to generate the SDG data is in a directory called "taxonomy". +> The tarball must contain three top-level directories: `data`, `model` and `taxonomy`. > [!CAUTION] > Make sure the tarball format is .tar.gz. diff --git a/standalone/standalone.py b/standalone/standalone.py index f5113f7..374bb28 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -313,7 +313,7 @@ def upload_s3_file(): top_level_dirs=$(tar --exclude='*/*' --list --file {data_pvc_mount_path}/data.tar.gz) # Loop through the expected directories and check if they exist in the archive - for dir in data model; do + for dir in data model taxonomy; do if ! 
echo "$top_level_dirs" | grep -q "^$dir/$"; then echo "Archive does not contain a '$dir' directory" exit 1 @@ -1268,7 +1268,6 @@ def data_processing(train_args: TrainingArgs) -> None: container = kubernetes.client.V1Container( name="sdg-preprocess", - # image="quay.io/tcoufal/ilab-sdg:latest", image=RHELAI_IMAGE, command=["/bin/sh", "-ce"], args=[ diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl index 4fe917f..d7ffe02 100755 --- a/standalone/standalone.tpl +++ b/standalone/standalone.tpl @@ -298,7 +298,7 @@ if [ "$STRATEGY" == "download" ]; then top_level_dirs=$(tar --exclude='*/*' --list --file {data_pvc_mount_path}/data.tar.gz) # Loop through the expected directories and check if they exist in the archive - for dir in data model; do + for dir in data model taxonomy; do if ! echo "$top_level_dirs" | grep -q "^$dir/$"; then echo "Archive does not contain a '$dir' directory" exit 1