diff --git a/.github/workflows/cron-update-kg-proxy.yml b/.github/workflows/cron-update-kg-proxy.yml
new file mode 100644
index 000000000..7c70aaec1
--- /dev/null
+++ b/.github/workflows/cron-update-kg-proxy.yml
@@ -0,0 +1,74 @@
+name: '[cron] update KG proxy'
+
+on:
+  schedule:
+    - cron: '5 4 * * 0' # every Sunday at 4am
+
+env:
+  EBRAINS_CCFLOW_CLIENT_ID: ${{ secrets.EBRAINS_OIDC_SIIBRA_CI_CLIENT_ID }}
+  EBRAINS_CCFLOW_CLIENT_SECRET: ${{ secrets.EBRAINS_OIDC_SIIBRA_CI_CLIENT_SECRET }}
+
+jobs:
+  update:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - artefacts-dir: "ebrainsquery/v3/AtlasAnnotation"
+            query-id: "66c69c41-ad91-4915-ac23-897fd0c60211"
+
+          - artefacts-dir: "ebrainsquery/v3/ParcellationEntity"
+            query-id: "d236e213-c048-4331-bfd9-b45c60bc3d03"
+
+          - artefacts-dir: "ebrainsquery/v3/ParcellationEntityVersion"
+            query-id: "5e80bb84-68cf-4425-859b-5306f6838693"
+
+          - artefacts-dir: "ebrainsquery/v3/CustomAnatomicalEntity"
+            query-id: "d7cdc1d6-0aa5-498a-94ff-0291b95db13d"
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: 'Install dependencies'
+        run: |
+          pip install ebrains-drive
+          pip install git+https://github.com/xgui3783/ebrains-iam-util.git
+      - name: 'Fetch and Sync to KG proxy'
+        run: |
+          export EBRAINS_QUERY_ID=${{ matrix.query-id }}
+          export EBRAINS_ARTEFACTS_DIR=${{ matrix.artefacts-dir }}
+          export EBRAINS_CCFLOW_CLIENT_ID=${{ env.EBRAINS_CCFLOW_CLIENT_ID }}
+          export EBRAINS_CCFLOW_CLIENT_SECRET=${{ env.EBRAINS_CCFLOW_CLIENT_SECRET }}
+
+          python _ci/cron_sync_kg_proxy.py
+
+  latest-siibra-release:
+    runs-on: ubuntu-latest
+    outputs:
+      LATEST_TAG: ${{ steps.get-latest-tag.outputs.LATEST_TAG }}
+    steps:
+      - id: get-latest-tag
+        env:
+          GH_TOKEN: ${{ github.token }} # gh CLI refuses to run without an auth token
+        run: |
+          LATEST_TAG=$(gh release -R fzj-inm1-bda/siibra-python ls --jq '.[0].tagName' --json 'tagName')
+          echo LATEST_TAG=$LATEST_TAG
+          echo "LATEST_TAG=$LATEST_TAG" >> $GITHUB_OUTPUT
+
+  check-region-e2e:
+    runs-on: ubuntu-latest
+    needs:
+      - update
+      - latest-siibra-release
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          repository: 'fzj-inm1-bda/siibra-python'
+          ref: ${{ needs.latest-siibra-release.outputs.LATEST_TAG }}
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - run: pip install -U pip && pip install -e . && pip install pytest
+      - run: pytest e2e/core/test_region.py
diff --git a/_ci/cron_sync_kg_proxy.py b/_ci/cron_sync_kg_proxy.py
new file mode 100644
index 000000000..82740ec30
--- /dev/null
+++ b/_ci/cron_sync_kg_proxy.py
@@ -0,0 +1,110 @@
+"""Fetch the results of a KG stored query and mirror them into the EBRAINS data-proxy bucket.
+
+Configuration comes from the environment:
+  EBRAINS_CCFLOW_CLIENT_ID / EBRAINS_CCFLOW_CLIENT_SECRET: OIDC client-credentials flow
+  EBRAINS_QUERY_ID: UUID of the KG stored query to run
+  EBRAINS_ARTEFACTS_DIR: destination prefix inside the bucket
+"""
+import os
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor
+from itertools import repeat
+import json
+from io import StringIO
+
+from tqdm import tqdm
+import requests
+from ebrains_iam.client_credential import ClientCredentialsSession
+from ebrains_drive import BucketApiClient
+
+EBRAINS_CCFLOW_CLIENT_ID = os.getenv("EBRAINS_CCFLOW_CLIENT_ID")
+EBRAINS_CCFLOW_CLIENT_SECRET = os.getenv("EBRAINS_CCFLOW_CLIENT_SECRET")
+
+EBRAINS_QUERY_ID = os.getenv("EBRAINS_QUERY_ID")
+EBRAINS_ARTEFACTS_DIR = os.getenv("EBRAINS_ARTEFACTS_DIR")
+
+KG_ROOT = os.getenv("KG_ROOT", "https://core.kg.ebrains.eu")
+
+def get_paginated(url: str, size: int, from_: int, token: str):
+    """Return one page (`size` items at offset `from_`) of a RELEASED KG query run."""
+    resp = requests.get(url,
+                        params={"stage": "RELEASED", "size": str(size), "from": str(from_)},
+                        headers={"Authorization": f"Bearer {token}"})
+    resp.raise_for_status()
+    return resp.json()
+
+def write_to_file(path: str, data):
+    """Serialize `data` as tab-indented JSON to `path`. NOTE(review): unused in this script."""
+    with open(path, "w") as fp:
+        json.dump(data, indent="\t", fp=fp)
+        fp.write("\n")
+
+def main():
+    assert EBRAINS_CCFLOW_CLIENT_ID, "EBRAINS_CCFLOW_CLIENT_ID must be defined"
+    assert EBRAINS_CCFLOW_CLIENT_SECRET, "EBRAINS_CCFLOW_CLIENT_SECRET must be defined"
+    assert EBRAINS_QUERY_ID, "EBRAINS_QUERY_ID must be defined"
+    assert EBRAINS_ARTEFACTS_DIR, "EBRAINS_ARTEFACTS_DIR must be defined!"
+
+    iamclient = ClientCredentialsSession(EBRAINS_CCFLOW_CLIENT_ID, EBRAINS_CCFLOW_CLIENT_SECRET, scope=["team"])
+
+    path = Path(EBRAINS_ARTEFACTS_DIR)
+    path.mkdir(parents=True, exist_ok=True)
+
+    token = iamclient.get_token()
+
+    url = f"{KG_ROOT}/v3/queries/{EBRAINS_QUERY_ID}/instances"
+    resp = get_paginated(url, 50, 0, token=token)
+
+    total = resp.get("total")
+
+    print(f"Getting {total=} instances, paginated by 50...")
+
+    # first page already fetched above; fan out over the remaining offsets
+    with ThreadPoolExecutor(max_workers=6) as ex:
+        remaining_results = list(ex.map(
+            get_paginated,
+            repeat(url),
+            repeat(50),
+            range(50, total, 50),
+            repeat(token)
+        ))
+
+    print("Uploading files ...")
+
+    bucketclient = BucketApiClient(token=token)
+    bucket = bucketclient.buckets.get_bucket("reference-atlas-data")
+
+    all_atlas_annotations = [datum for r in [resp, *remaining_results] for datum in r.get("data")]
+
+    progress = tqdm(total=len(all_atlas_annotations), desc="Uploading", unit="Files")
+
+    for datum in all_atlas_annotations:
+        io = StringIO(json.dumps(datum, indent="\t"))
+        _id = datum.get("id").split("/")[-1]
+
+        retry_counter = 0
+        while True:
+            # rewind on EVERY attempt: a failed upload may leave the stream
+            # mid-way, and retrying would then send truncated/empty content
+            io.seek(0)
+            try:
+                bucket.upload(io, f"{EBRAINS_ARTEFACTS_DIR}/{_id}.json")
+                progress.update()
+                break
+            except Exception as e:
+                print(f"Error: {str(e)}")
+                if retry_counter >= 5:
+                    print("Retry max hit, terminating")
+                    raise
+                retry_counter += 1
+                print("Retrying ...")
+
+                # the token may have expired mid-run; refresh and rebuild the client
+                token = iamclient.get_token()
+                bucketclient = BucketApiClient(token=token)
+                bucket = bucketclient.buckets.get_bucket("reference-atlas-data")
+
+    progress.close()
+
+if __name__ == "__main__":
+    main()