From 6ef81be3d52ca1579616ff90ac020d6eaf972713 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Wed, 11 Dec 2024 09:50:22 +0100 Subject: [PATCH] Change SILO container to run for the short-read sequences in the metadata.s3Link column of get-released-data --- kubernetes/loculus/silo_import_job.sh | 36 +++++++++- .../loculus/templates/_siloDatabaseConfig.tpl | 65 ++++++++++++------- .../templates/lapis-silo-deployment.yaml | 9 +++ kubernetes/loculus/values_preview_server.yaml | 7 +- 4 files changed, 91 insertions(+), 26 deletions(-) diff --git a/kubernetes/loculus/silo_import_job.sh b/kubernetes/loculus/silo_import_job.sh index c4c30396f7..15e207646c 100755 --- a/kubernetes/loculus/silo_import_job.sh +++ b/kubernetes/loculus/silo_import_job.sh @@ -148,14 +148,44 @@ download_data() { echo } +extract_short_read_files_from_s3() { + # Input from https://backend-wise-seqs.loculus.org/test/get-released-data + + aws configure set aws_access_key_id "$AWS_ACCESS_KEY" + aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY" + aws configure set region "$AWS_DEFAULT_REGION" + + S3_LINKS_FILE="tmp_s3_links.txt" + + # Extract S3 links from the metadata + jq -r '.metadata.s3Link' "$new_input_data_path" > "$S3_LINKS_FILE" + + touch "$silo_input_data_path" + + # Loop through each S3 link and append the content to the output file + while read -r S3_LINK; do + # Temporary file for downloaded content + TEMP_FILE=$(mktemp) + + # Download the ndjson file from S3 + aws s3 cp "$S3_LINK" "$TEMP_FILE" + + # Append the content to the output file + cat "$TEMP_FILE" >> "$silo_input_data_path" + + # Clean up the temporary file + rm "$TEMP_FILE" + done < "$S3_LINKS_FILE" +} + preprocessing() { echo "Starting preprocessing" rm -f "$silo_input_data_path" - # This is necessary because the silo preprocessing is configured to expect the input data - # at /preprocessing/input/data.ndjson.zst - cp "$new_input_data_path" "$silo_input_data_path" + # take data from $new_input_data_path, get all data from the S3 buckets (referenced in column s3Link) + # and put it into $silo_input_data_path + extract_short_read_files_from_s3 set +e time /app/siloApi --preprocessing diff --git a/kubernetes/loculus/templates/_siloDatabaseConfig.tpl b/kubernetes/loculus/templates/_siloDatabaseConfig.tpl index bc22a36fdf..34606d0085 100644 --- a/kubernetes/loculus/templates/_siloDatabaseConfig.tpl +++ b/kubernetes/loculus/templates/_siloDatabaseConfig.tpl @@ -12,27 +12,48 @@ {{- define "loculus.siloDatabaseConfig" }} schema: - {{- $segments := .nucleotideSequences | default (list "main")}} - {{- $is_segmented := gt (len $segments) 1 }} - instanceName: {{ .organismName }} - opennessLevel: OPEN metadata: - {{- range (concat .commonMetadata .metadata) }} - {{- $currentItem := . }} - {{- if and $is_segmented .perSegment }} - {{- range $segment := $segments }} - {{- with $currentItem }} - {{- include "loculus.siloDatabaseShared" . | nindent 4 }} - name: {{ printf "%s_%s" .name $segment | quote}} - {{- end }} - {{- end }} - {{- else }} - {{- include "loculus.siloDatabaseShared" . | nindent 4 }} - name: {{ .name }} - {{- end }} - {{- end }} - primaryKey: accessionVersion -{{ if .silo}} - {{- .silo | toYaml | nindent 2 }} -{{ end }} + - name: sample_id + type: string + generateIndex: false + - name: batch_id + type: string + generateIndex: false + - name: sequencing_well_position + type: string + generateIndex: false + - name: location_code + type: string + generateIndex: false + - name: sampling_date + type: date + generateIndex: false + - name: sequencing_date + type: string + generateIndex: false + - name: flow_cell_serial_number + type: string + generateIndex: false + - name: read_length + type: int + generateIndex: false + - name: primer_protocol + type: string + generateIndex: false + - name: location_name + type: string + generateIndex: false + - name: primer_protocol_name + type: string + generateIndex: false + - name: nextclade_reference + type: string + generateIndex: false + - name: read_id + type: string + generateIndex: false + opennessLevel: OPEN + instanceName: wise-sarsCoV2 + features: [] + primaryKey: read_id {{- end }} diff --git a/kubernetes/loculus/templates/lapis-silo-deployment.yaml b/kubernetes/loculus/templates/lapis-silo-deployment.yaml index c5df2c873d..ec21d7a634 100644 --- a/kubernetes/loculus/templates/lapis-silo-deployment.yaml +++ b/kubernetes/loculus/templates/lapis-silo-deployment.yaml @@ -84,6 +84,15 @@ spec: {{- else }} value: "http://loculus-backend-service:8079/{{ $key }}" {{- end }} + - name: AWS_DEFAULT_REGION + value: eu-central-1 + - name: AWS_ACCESS_KEY + value: AKIA6AB5EFK3N6KDJJ52 + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: wise-short-read-sequence-bucket + key: secret-access-key volumeMounts: - name: lapis-silo-database-config-processed mountPath: /preprocessing/input/reference_genomes.json diff --git a/kubernetes/loculus/values_preview_server.yaml b/kubernetes/loculus/values_preview_server.yaml index 7452a3b992..6660bb6844 100644 --- a/kubernetes/loculus/values_preview_server.yaml +++ b/kubernetes/loculus/values_preview_server.yaml @@ -34,4 +34,9 @@ reduceResourceRequest: true previewDocs: false robotsNoindexHeader: true disableEnaSubmission: false -additionalHeadHTML: '' \ No newline at end of file +additionalHeadHTML: '' +wise-short-read-sequence-bucket: + type: sealedsecret + clusterWide: "true" + data: + secret-access-key: 'AgB9YyY/cXTfmi44zSfySs2mRjEeZfaye4lZlNL//mMuc7kaQZcEaBQ2N0C4UBQBN3zz61T+3YrR64PSMcRC97GaEGj/fMrxc8WUW9AMzLZEHIXZRmrM+BChPCA/MGoN/ekUpBuWZnTlh48fGxQg4GlCHFrnq3fpztoHiSrmED6Q7FuWOliuWnRqObmyh7xs+6AwGcs0NRhH5yQVAjwZlL9/m8LN4Cjr1mA7yedYuSYd5Ztdy5LMHOukWH9tD+NKdH8X/BfIP6axQCUA4wUiiGWI+mXfBC1dXDaueblu1zTejloLJ3CpS9BGuzS2uxC5ac9xVifA6hljyWD8oPRQ7Rzi7Uv3gdMTUFXZBCLMLK9YQntqhqnvkroWdg4kn7J4VOKbLiHz6JiLCbiELPhpbEuvZFGQl4psANSg1ODOuaibcdMiwfJc1vnSCEzQ27ura/ubZ2v6QcEz5c1jDasG26e+n5xSHtzn1aKHPaRdBTQJa5F5TPYbFiIZbWY9+1mabNxRSVCeAusmeXCyDDI7z4NqxrLBQW5NDGZx1vvDYrPiAwcjjbNX+y7P7apPsvi23n4MNWbd5WCZ6ETyG7pirNsGmkS8kNvjlUlXdkcCF5tAvGwvfPtndxeizul4sxg98eKXSOzeiMbgmpAry42OPSgF7HbHMvurWCltIkdrutb54TIstei1qjV5MBmEXVGSM+m4Zzj1u6YoiC/DV6grnuaCE8ZCf2nWHFtiy26oBo7+7NLsHE7uROfR' \ No newline at end of file