Skip to content

Commit

Permalink
Merge pull request #450 from battlecode/jerrym-saturn
Browse files: browse the repository at this point in the history
Saturn devops fixes
  • Loading branch information
j-mao authored Jan 7, 2023
2 parents 0538d86 + 5786b1f commit 67d2481
Show file tree
Hide file tree
Showing 17 changed files with 131 additions and 48 deletions.
4 changes: 2 additions & 2 deletions backend/siarnaq/api/compete/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def for_saturn(self):

def enqueue_options(self):
"""Return the options to be submitted to the compilation queue."""
report_url = "https://{}{}".format(
report_url = "http://{}{}".format(
settings.ALLOWED_HOSTS[0],
reverse(
"submission-report",
Expand Down Expand Up @@ -256,7 +256,7 @@ def for_saturn(self):

def enqueue_options(self):
"""Return the options to be submitted to the execution queue."""
report_url = "https://{}{}".format(
report_url = "http://{}{}".format(
settings.ALLOWED_HOSTS[0],
reverse(
"match-report",
Expand Down
4 changes: 3 additions & 1 deletion backend/siarnaq/api/compete/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ class AlreadyFinalized(APIException):

class SaturnInvocationSerializer(serializers.Serializer):
status = serializers.ChoiceField(SaturnStatus.choices)
logs = serializers.CharField(required=False)
logs = serializers.CharField(
required=False, allow_blank=True, trim_whitespace=False
)
interrupted = serializers.BooleanField(required=False)

def update(self, instance, validated_data):
Expand Down
2 changes: 1 addition & 1 deletion backend/siarnaq/api/episodes/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def update_autoscrim_schedule(instance, update_fields, **kwargs):
f"{parent}/jobs/"
f"{settings.GCLOUD_SCHEDULER_PREFIX}-autoscrim-{instance.name_short}"
)
url = "https://{}{}".format(
url = "http://{}{}".format(
settings.ALLOWED_HOSTS[0],
reverse("episode-autoscrim", kwargs={"pk": instance.pk}),
)
Expand Down
4 changes: 4 additions & 0 deletions deploy/cd/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@ output "artifact_image" {
k => "${google_artifact_registry_repository.this.location}-docker.pkg.dev/${google_artifact_registry_repository.this.project}/${google_artifact_registry_repository.this.repository_id}/${k}"
}
}

# Exposes the `name` attribute of the Artifact Registry repository managed by
# this module, so that callers of the module can refer to the repository.
output "artifact_registry_name" {
value = google_artifact_registry_repository.this.name
}
14 changes: 8 additions & 6 deletions deploy/galaxy/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,15 @@ module "saturn_compile" {
storage_public_name = google_storage_bucket.public.name
storage_secure_name = google_storage_bucket.secure.name

artifact_registry_name = var.artifact_registry_name

secret_id = google_secret_manager_secret.saturn.secret_id

network_vpc_id = google_compute_network.this.id
subnetwork_ip_cidr = "172.16.0.0/16"

machine_type = "e2-medium"
image = var.saturn_image
command = "compile"

max_instances = var.max_compile_instances
min_instances = 0
Expand All @@ -200,22 +201,23 @@ module "saturn_execute" {
storage_public_name = google_storage_bucket.public.name
storage_secure_name = google_storage_bucket.secure.name

artifact_registry_name = var.artifact_registry_name

secret_id = google_secret_manager_secret.saturn.secret_id

network_vpc_id = google_compute_network.this.id
subnetwork_ip_cidr = "172.17.0.0/16"

machine_type = "e2-highmem-2"
image = var.saturn_image
command = "execute"

max_instances = var.max_execute_instances
min_instances = 0
load_ratio = 10
}

resource "google_cloudbuild_trigger" "saturn" {
name = var.name
name = "${var.name}-saturn"

github {
owner = "battlecode"
Expand All @@ -229,7 +231,7 @@ resource "google_cloudbuild_trigger" "saturn" {
build {
step {
name = "gcr.io/cloud-builders/docker"
args = ["build", "--build-arg", "BUILD=$SHORT_SHA", "-t", var.saturn_image, "."]
args = ["build", "--build-arg", "REVISION_ARG=$TAG_NAME+$SHORT_SHA.$BUILD_ID", "-t", var.saturn_image, "."]
dir = "saturn"
}
step {
Expand All @@ -239,12 +241,12 @@ resource "google_cloudbuild_trigger" "saturn" {
step {
name = "gcr.io/google.com/cloudsdktool/cloud-sdk"
entrypoint = "gcloud"
args = ["compute", "instance-groups", "managed", "rolling-action", "replace", module.saturn_compile.compute_group_name, "--region", var.gcp_region]
args = ["compute", "instance-groups", "managed", "rolling-action", "replace", module.saturn_compile.compute_group_name, "--zone", var.gcp_zone]
}
step {
name = "gcr.io/google.com/cloudsdktool/cloud-sdk"
entrypoint = "gcloud"
args = ["compute", "instance-groups", "managed", "rolling-action", "replace", module.saturn_execute.compute_group_name, "--region", var.gcp_region]
args = ["compute", "instance-groups", "managed", "rolling-action", "replace", module.saturn_execute.compute_group_name, "--zone", var.gcp_zone]
}
}
}
5 changes: 5 additions & 0 deletions deploy/galaxy/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ variable "siarnaq_secrets" {
type = map
}

# Artifact Registry repository name, supplied by the caller of this module.
# NOTE(review): appears to be forwarded unchanged to the Saturn worker modules
# via their `artifact_registry_name` argument — confirm against main.tf.
variable "artifact_registry_name" {
description = "Name of the Artifact Registry where the build image can be found"
type = string
}

variable "titan_image" {
description = "Image for the Titan Docker container"
type = string
Expand Down
4 changes: 4 additions & 0 deletions deploy/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ module "production" {
database_authorized_networks = []
siarnaq_secrets = merge(var.siarnaq_secrets_common, var.siarnaq_secrets_production)

artifact_registry_name = module.cd.artifact_registry_name

titan_image = module.cd.artifact_image["titan"]

saturn_image = module.cd.artifact_image["saturn"]
Expand Down Expand Up @@ -75,6 +77,8 @@ module "staging" {
database_authorized_networks = ["0.0.0.0/0"]
siarnaq_secrets = merge(var.siarnaq_secrets_common, var.siarnaq_secrets_staging)

artifact_registry_name = module.cd.artifact_registry_name

titan_image = module.cd.artifact_image["titan"]

saturn_image = module.cd.artifact_image["saturn"]
Expand Down
15 changes: 13 additions & 2 deletions deploy/saturn/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ resource "google_storage_bucket_iam_member" "secure" {
member = "serviceAccount:${google_service_account.this.email}"
}

# Grants this module's service account the read-only role
# (roles/artifactregistry.reader) on the given Artifact Registry repository.
# Presumably this is what lets the worker instances pull the container image;
# verify against the instance template.
# NOTE(review): `location` is taken from var.gcp_region, so this assumes the
# repository lives in the same region as the workers — confirm.
resource "google_artifact_registry_repository_iam_member" "this" {
location = var.gcp_region
repository = var.artifact_registry_name
role = "roles/artifactregistry.reader"
member = "serviceAccount:${google_service_account.this.email}"
}

resource "google_pubsub_subscription" "queue" {
name = var.name
topic = var.pubsub_topic_name
Expand Down Expand Up @@ -55,7 +62,11 @@ module "container" {

container = {
image = var.image
args = [var.command]
args = [
"-project=${var.gcp_project}",
"-secret=${var.secret_id}",
"-subscription=${google_pubsub_subscription.queue.name}",
]
}
}

Expand Down Expand Up @@ -132,7 +143,7 @@ resource "google_compute_autoscaler" "this" {

metric {
name = "pubsub.googleapis.com/subscription/num_undelivered_messages"
filter = "resource.type = pubsub_subscription AND resource.label.subscription_id = \"${google_pubsub_subscription.queue.id}\""
filter = "resource.type = pubsub_subscription AND resource.label.subscription_id = \"${google_pubsub_subscription.queue.name}\""
single_instance_assignment = var.load_ratio
}
}
Expand Down
10 changes: 5 additions & 5 deletions deploy/saturn/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ variable "storage_secure_name" {
type = string
}

# Artifact Registry repository name, supplied by the caller of this module.
# NOTE(review): expected to be the repository that holds the image named by
# var.image — confirm against the module's callers.
variable "artifact_registry_name" {
description = "Name of the Artifact Registry where the build image can be found"
type = string
}

variable "secret_id" {
description = "ID of the Secret resource"
type = string
Expand All @@ -58,11 +63,6 @@ variable "image" {
type = string
}

variable "command" {
description = "Arguments for the Docker entrypoint"
type = string
}

variable "max_instances" {
description = "Maximum allowable size of the worker pool"
type = number
Expand Down
24 changes: 24 additions & 0 deletions saturn/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# ---- Build stage: compile the Saturn binary --------------------------------
FROM golang:1.18-buster AS go

ENV BUILD_HOME=/build
WORKDIR $BUILD_HOME

# Copy only the module manifests first so that the dependency download layer
# stays cached until go.mod or go.sum change.
COPY go.mod go.sum ./
RUN go mod download

COPY . .
# CGO is disabled so the binary does not link against the build image's C
# library; -s -w omit the symbol table and DWARF debug info to shrink it.
RUN GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o /saturn -ldflags="-s -w" ./cmd/saturn/main.go


# ---- Runtime stage: JDK 8 base image carrying only the compiled binary ------
FROM openjdk:8-slim-buster

ENV APP_HOME=/app
WORKDIR $APP_HOME

# REVISION_ARG is a build-time argument (default "nightly"); it is re-exported
# as the REVISION environment variable so it is visible to the running process.
ARG REVISION_ARG=nightly
ENV REVISION=$REVISION_ARG

# Documentation only: EXPOSE does not publish the port.
# NOTE(review): 8005 looks like the monitor port the binary listens on — confirm.
EXPOSE 8005

COPY --from=go /saturn .
ENTRYPOINT ["./saturn"]
33 changes: 17 additions & 16 deletions saturn/cmd/saturn/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ package main

import (
"context"
"flag"
"fmt"
"os"
"os/signal"

"github.com/battlecode/galaxy/saturn/pkg/run"
Expand All @@ -11,41 +14,39 @@ import (
"golang.org/x/sys/unix"
)

const (
gcpProjectID = "mitbattlecode"

gcpSecretName = "staging-saturn"

gcpPubsubSubscriptionID = "staging-saturn-compile"
gcpTokenedReporterAudience = "siarnaq"
gcpTokenedReporterUserAgent = "Galaxy-Saturn"
monitorAddress = "127.0.0.1:8005"

scaffoldRoot = "/scaffolds"
// Command-line flags configuring the Saturn worker. Each variable holds the
// pointer returned by the flag package; values are only meaningful after
// flag.Parse has been called.
var (
	// gcpProjectID defaults to the GCP_PROJECT environment variable as read
	// at program start.
	gcpProjectID = flag.String("project", os.Getenv("GCP_PROJECT"), "the project id on gcp")

	// gcpSecretName and gcpPubsubSubscriptionID have no usable default and
	// are empty unless passed explicitly.
	gcpSecretName           = flag.String("secret", "", "the name of the saturn secret")
	gcpPubsubSubscriptionID = flag.String("subscription", "", "the name of the pubsub subscription")

	// Settings for the reporter that sends task results back.
	gcpTokenedReporterAudience  = flag.String("audience", "siarnaq", "the audience for gcp oidc tokens")
	gcpTokenedReporterUserAgent = flag.String("useragent", "Galaxy-Saturn", "the user agent for reporting")

	// monitorPort is the port for the shutdown monitor.
	monitorPort = flag.Uint("port", 8005, "the port for monitoring shutdowns")

	// scaffoldRoot is the directory under which scaffolds are saved.
	scaffoldRoot = flag.String("scaffold", "/scaffolds", "the root directory for saving scaffolds")
)

func main() {
zerolog.DefaultContextLogger = &log.Logger
zerolog.LevelFieldName = "severity"
flag.Parse()

ctx, stop := signal.NotifyContext(context.Background(), unix.SIGINT, unix.SIGTERM)
defer stop()

secret, err := saturn.ReadSecret(ctx, gcpProjectID, gcpSecretName)
secret, err := saturn.ReadSecret(ctx, *gcpProjectID, *gcpSecretName)
if err != nil {
log.Ctx(ctx).Fatal().Err(err).Msg("Could not read secrets.")
}

multiplexer, err := run.NewScaffoldMultiplexer(scaffoldRoot, secret)
multiplexer, err := run.NewScaffoldMultiplexer(*scaffoldRoot, secret)
if err != nil {
log.Ctx(ctx).Fatal().Err(err).Msg("Could not initialize scaffold multiplexer.")
}

app, err := saturn.New(
ctx,
saturn.WithMonitor(monitorAddress),
saturn.WithGcpPubsubSubcriber(gcpProjectID, gcpPubsubSubscriptionID),
saturn.WithGcpTokenedReporter(gcpTokenedReporterAudience, gcpTokenedReporterUserAgent),
saturn.WithMonitor(fmt.Sprintf("127.0.0.1:%d", *monitorPort)),
saturn.WithGcpPubsubSubcriber(*gcpProjectID, *gcpPubsubSubscriptionID),
saturn.WithGcpTokenedReporter(*gcpTokenedReporterAudience, *gcpTokenedReporterUserAgent),
saturn.WithRunner("compile", multiplexer.Compile),
saturn.WithRunner("execute", multiplexer.Execute),
)
Expand Down
5 changes: 3 additions & 2 deletions saturn/pkg/run/java8.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ func NewJava8Scaffold(ctx context.Context, episode saturn.Episode, repo *git.Rep
s.UploadReplay(),
s.DetermineScores(),
}
s.matchOutputs = make(map[*StepArguments]string)
return s, nil
}

Expand Down Expand Up @@ -163,8 +164,8 @@ func (s *Java8Scaffold) RunMatch() *Step {
fmt.Sprintf("-PonSaturn=%t", true),
fmt.Sprintf("-PteamA=%s", arg.Details.(ExecuteRequest).A.TeamName),
fmt.Sprintf("-PteamB=%s", arg.Details.(ExecuteRequest).B.TeamName),
fmt.Sprintf("-PclassLocationA=%s", filepath.Join("data", "a")),
fmt.Sprintf("-PclassLocationB=%s", filepath.Join("data", "b")),
fmt.Sprintf("-PclassLocationA=%s", filepath.Join("data", "A")),
fmt.Sprintf("-PclassLocationB=%s", filepath.Join("data", "B")),
fmt.Sprintf("-PpackageNameA=%s", arg.Details.(ExecuteRequest).A.Package),
fmt.Sprintf("-PpackageNameB=%s", arg.Details.(ExecuteRequest).B.Package),
fmt.Sprintf("-Preplay=%s", filepath.Join("data", "replay.bin")),
Expand Down
2 changes: 1 addition & 1 deletion saturn/pkg/run/protocol.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ type Submission struct {
}

type CompileRequest struct {
Submission `mapstructure:"submission"`
Submission `mapstructure:"submission,squash"`
}

type ExecuteRequest struct {
Expand Down
2 changes: 1 addition & 1 deletion saturn/pkg/run/recipe.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ var StateVersion = Step{
Callable: func(ctx context.Context, arg *StepArguments) error {
log.Ctx(ctx).Debug().Msg("Welcome to Saturn!")
log.Ctx(ctx).Debug().Msgf("Node: %s", os.Getenv("HOSTNAME"))
log.Ctx(ctx).Debug().Msgf("Build: %s", os.Getenv("SATURN_BUILD"))
log.Ctx(ctx).Debug().Msgf("Revision: %s", os.Getenv("REVISION"))
return nil
},
}
1 change: 1 addition & 0 deletions saturn/pkg/saturn/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ func NewGCPPubsubSubscriber(ctx context.Context, projectID, subscriptionID strin
func (c *GCPPubsubSubscriber) Subscribe(ctx context.Context, handler QueuedTaskHandler) error {
err := c.subscription.Receive(ctx, func(ctx context.Context, msg *pubsub.Message) {
defer msg.Nack()
log.Ctx(ctx).Info().Bytes("message", msg.Data).Msg("Starting task.")
var task TaskPayload
if err := json.Unmarshal(msg.Data, &task); err != nil {
log.Ctx(ctx).Error().Err(err).Msg("Invalid message.")
Expand Down
15 changes: 12 additions & 3 deletions saturn/pkg/saturn/report.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"

"github.com/rs/zerolog/log"
"google.golang.org/api/idtoken"
)

Expand Down Expand Up @@ -37,16 +39,17 @@ func (r *GCPTokenedReporter) Report(ctx context.Context, t *Task) error {
}
payload["invocation"] = map[string]interface{}{
"status": t.status.String(),
"logs": t.logs,
"logs": t.logs.String(),
"interrupted": t.status == TaskInterrupted,
}

body, err := json.Marshal(payload)
reqBody, err := json.Marshal(payload)
if err != nil {
return fmt.Errorf("json.Marshal: %v", err)
}
log.Ctx(ctx).Debug().RawJSON("payload", reqBody).Msg("Sending report.")

req, err := http.NewRequestWithContext(ctx, "POST", t.Payload.Metadata.ReportURL, bytes.NewBuffer(body))
req, err := http.NewRequest("POST", t.Payload.Metadata.ReportURL, bytes.NewBuffer(reqBody))
if err != nil {
return fmt.Errorf("http.NewRequestWithContext: %v", err)
}
Expand All @@ -59,6 +62,12 @@ func (r *GCPTokenedReporter) Report(ctx context.Context, t *Task) error {
}
defer resp.Body.Close()

respBody, err := ioutil.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("ioutil.ReadAll: %v", err)
}
log.Ctx(ctx).Debug().Bytes("response", respBody).Msg("Report sent.")

if resp.StatusCode == http.StatusConflict {
t.Finish(TaskAborted, nil)
}
Expand Down
Loading

0 comments on commit 67d2481

Please sign in to comment.