Skip to content

Commit

Permalink
Merge branch 'main' into feature/etcd-cluster-size-configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
elijah-k-nebius committed Sep 18, 2024
2 parents ceaa737 + 757ef17 commit f4299c5
Show file tree
Hide file tree
Showing 13 changed files with 187 additions and 103 deletions.
50 changes: 41 additions & 9 deletions .github/workflows/terraform.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
pull_request:

permissions:
contents: read
contents: write

concurrency:
group: project-e00pjzzrtk1fs3yavy
Expand All @@ -22,10 +22,11 @@ jobs:
max-parallel: 2
matrix:
solution:
- name: compute-testing
- name: slurm
- name: k8s-inference
- name: k8s-training
- name: slurm
- name: compute-testing
- name: wireguard

defaults:
run:
Expand Down Expand Up @@ -61,6 +62,9 @@ jobs:
> ~/.s3cfg
mkdir -p tests/reports
- name: Install XMLStarlet
run: sudo apt install -y xmlstarlet

- name: Install Nebius CLI
run: |
curl -sSL https://storage.ai.nebius.cloud/nebius/install.sh | bash
Expand Down Expand Up @@ -107,18 +111,46 @@ jobs:

# Run Terraform Tests
- name: Terraform Test
run: terraform test -verbose -junit-xml=tests/reports/TEST-result-${{ github.run_id }}.xml
run: terraform test -junit-xml=tests/reports/TEST-result-${{ github.run_id }}.xml

- name: Test Summary
uses: test-summary/action@v2
with:
paths: ${{ matrix.solution.name }}/tests/reports/TEST-result-${{ github.run_id }}.xml
if: always()
- name: Set date in report
run: |
xmlstarlet ed \
--inplace \
-i '/testsuites' -t attr -n timestamp -v $(date --iso-8601=seconds) \
-i '/testsuites/testsuite[*]' -t attr -n timestamp -v $(date --iso-8601=seconds) \
-i '/testsuites/testsuite[*]/testcase' -t attr -n timestamp -v $(date --iso-8601=seconds) \
tests/reports/TEST-result-${{ github.run_id }}.xml
- name: Upload test results
run: s3cmd sync tests/reports s3://terraform-test-reports/${{ matrix.solution.name }}/
if: always()

- name: Load test report history
uses: actions/checkout@v4
if: always()
continue-on-error: true
with:
ref: gh-pages
path: gh-pages

- name: Build test report
uses: simple-elf/allure-report-action@v1.7
if: always()
with:
gh_pages: gh-pages
subfolder: ${{ matrix.solution.name }}
allure_results: ${{ matrix.solution.name }}/tests/reports

- name: Publish test report
uses: peaceiris/actions-gh-pages@v4
if: always()
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_branch: gh-pages
publish_dir: allure-history
keep_files: true

cleanup-infra:
name: 'Cleanup Infra'
environment:
Expand Down
2 changes: 1 addition & 1 deletion compute-testing/tests/main.tftest.hcl
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
run "create_cluster" {
# Plan-only check for the compute-testing solution: the configuration is
# planned but nothing is applied.
run "compute_testing_plan" {
  command = plan
}
25 changes: 24 additions & 1 deletion k8s-inference/tests/main.tftest.hcl
Original file line number Diff line number Diff line change
@@ -1,4 +1,27 @@
run "create_cluster" {
# Targeted apply limited to the nebius_mk8s_v1_cluster.k8s-cluster resource.
run "k8s_inference_apply" {
  command = apply

  plan_options {
    target = [nebius_mk8s_v1_cluster.k8s-cluster]
  }
}

# Targeted apply limited to the two node groups (cpu-only and gpu).
run "k8s_node_groups_inference_apply" {
  command = apply

  plan_options {
    target = [
      nebius_mk8s_v1_node_group.cpu-only,
      nebius_mk8s_v1_node_group.gpu,
    ]
  }
}

# Untargeted apply of the whole configuration.
run "full_inference_apply" {
  command = apply
}

run "test_mode_k8s_inference_apply" {
command = apply

variables {
Expand Down
29 changes: 28 additions & 1 deletion k8s-training/tests/main.tftest.hcl
Original file line number Diff line number Diff line change
@@ -1,4 +1,31 @@
run "create_cluster" {
# Targeted apply limited to the nebius_mk8s_v1_cluster.k8s-cluster resource.
run "k8s_training_apply" {
  command = apply

  plan_options {
    target = [nebius_mk8s_v1_cluster.k8s-cluster]
  }
}

# Targeted apply limited to the two node groups (cpu-only and gpu).
run "k8s_node_groups_training_apply" {
  command = apply

  plan_options {
    target = [
      nebius_mk8s_v1_node_group.cpu-only,
      nebius_mk8s_v1_node_group.gpu,
    ]
  }
}

# Untargeted apply of the whole configuration, with Loki switched off.
run "full_training_apply" {
  command = apply

  variables {
    # TODO: Loki is disabled because a non-empty storage bucket cannot be deleted.
    enable_loki = false
  }
}

run "test_mode_k8s_training_apply" {
command = apply

variables {
Expand Down
24 changes: 23 additions & 1 deletion slurm/tests/main.tftest.hcl
Original file line number Diff line number Diff line change
@@ -1,4 +1,26 @@
run "create_cluster" {
# Targeted apply limited to the nebius_compute_v1_instance.master resource,
# with the worker count variable set to 2.
run "slurm_master_apply" {
  command = apply

  variables {
    cluster_workers_count = 2
  }

  plan_options {
    target = [nebius_compute_v1_instance.master]
  }
}

# Untargeted apply of the whole configuration with the same worker count.
run "slurm_full_apply" {
  command = apply

  variables {
    cluster_workers_count = 2
  }
}

run "test_mode_slurm_apply" {
command = apply

variables {
Expand Down
26 changes: 0 additions & 26 deletions tmp-tests/pv-pvc.yaml

This file was deleted.

50 changes: 0 additions & 50 deletions tmp-tests/pvc-users.yaml

This file was deleted.

16 changes: 9 additions & 7 deletions wireguard/main.tf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
resource "nebius_compute_v1_instance" "wireguard-instanse" {
resource "nebius_compute_v1_instance" "wireguard_instance" {
parent_id = var.parent_id
name = "wireguard-instanse"
name = "wireguard-instance"

boot_disk = {
attach_mode = "READ_WRITE"
Expand All @@ -9,16 +9,18 @@ resource "nebius_compute_v1_instance" "wireguard-instanse" {

network_interfaces = [
{
name = "eth0"
subnet_id = var.subnet_id
ip_address = {}
public_ip_address = var.public_ip_allocation_id != null ? { allocation_id = var.public_ip_allocation_id } : {}
name = "eth0"
subnet_id = var.subnet_id
ip_address = {}
public_ip_address = {
allocation_id = var.public_ip_allocation_id
}
}
]

resources = {
platform = "cpu-e2"
preset = "16vcpu-64gb"
preset = "4vcpu-16gb"
}


Expand Down
3 changes: 3 additions & 0 deletions wireguard/output.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Address reported for the public IP of the instance's first network
# interface, with a trailing "/32" removed.
output "wg_instance_pib" {
  value = trimsuffix(
    nebius_compute_v1_instance.wireguard_instance.status.network_interfaces[0].public_ip_address.address,
    "/32"
  )
}
13 changes: 8 additions & 5 deletions wireguard/terraform.tfvars
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#parent_id = ""
#subnet_id = ""
#ssh_user_name = ""
#ssh_public_key = ""
#public_ip_allocation_id = ""
# parent_id = ""
# subnet_id = ""
# ssh_user_name = "ubuntu"
# ssh_public_key = {
# key = "put your public ssh key here"
# path = "put path to ssh key here"
# }
# public_ip_allocation_id = ""
32 changes: 32 additions & 0 deletions wireguard/test-resource.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
locals {
  # Host used by the test-mode checks: the public IP address of the instance's
  # first network interface, with a trailing "/32" removed.
  test_wg_host = trimsuffix(
    nebius_compute_v1_instance.wireguard_instance.status.network_interfaces[0].public_ip_address.address,
    "/32"
  )
}

# Test-mode check, created only when var.test_mode is true. It connects to the
# WireGuard instance and runs a short script that fails on the first error.
resource "null_resource" "check_wireguard_instance" {
  count = var.test_mode ? 1 : 0

  connection {
    # Use the module's configured SSH user rather than a hard-coded "ubuntu".
    # NOTE(review): assumes the instance provisions var.ssh_user_name as its
    # login user - confirm against the cloud-init configuration in main.tf.
    user = var.ssh_user_name
    host = local.test_wg_host
  }

  provisioner "remote-exec" {
    inline = [
      # Abort on the first failing command or unset variable.
      "set -eu",
      # Block until cloud-init has finished.
      "cloud-init status --wait",
      # The wg0 interface must exist.
      "ip link show wg0",
      # The wg-quick unit for wg0 must report a healthy status.
      "systemctl -q status wg-quick@wg0.service > /dev/null",
    ]
  }
}


# Test-mode check, created only when var.test_mode is true. It runs after the
# instance check and requests port 5000 on the instance from the machine that
# runs Terraform.
resource "null_resource" "check_wireguard_web_ui" {
  count      = var.test_mode ? 1 : 0
  depends_on = [null_resource.check_wireguard_instance]

  provisioner "local-exec" {
    interpreter = ["bash", "-c"]
    # Keep the initial grace period, then retry a bounded number of times
    # (including on "connection refused") so a slow start does not fail the
    # check; the connect timeout keeps an unreachable host from hanging the run.
    command = "sleep 15 && curl --connect-timeout 10 --retry 5 --retry-delay 5 --retry-connrefused ${local.test_wg_host}:5000"
  }
}
11 changes: 11 additions & 0 deletions wireguard/tests/main.tftest.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Apply the configuration with default variable values.
run "wireguard_apply" {
  command = apply
}

# Apply again with test_mode switched on.
run "test_mode_wireguard_apply" {
  command = apply

  variables {
    test_mode = true
  }
}
9 changes: 7 additions & 2 deletions wireguard/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ variable "ssh_user_name" {
}

variable "ssh_public_key" {
description = "SSH Public Key to access the cluster nodes"
description = "SSH Public Key to access the cluster nodes."
type = object({
key = optional(string),
path = optional(string, "~/.ssh/id_rsa.pub")
Expand All @@ -29,10 +29,15 @@ variable "ssh_public_key" {
}
}


# Access By IP
# Optional string input; defaults to null when the caller does not set it.
variable "public_ip_allocation_id" {
  type        = string
  default     = null
  description = "Id of a manually created public_ip_allocation."
}

# Boolean switch, off by default, for turning on test-mode behaviour.
variable "test_mode" {
  description = "Switch between real usage and testing."
  type        = bool
  default     = false

  # An explicit null falls back to the default instead of reaching
  # conditionals such as `var.test_mode ? 1 : 0`, which reject a null condition.
  nullable = false
}

0 comments on commit f4299c5

Please sign in to comment.