Commit

Initial version
erikvanoosten committed Sep 24, 2024
0 parents commit e23780b
Showing 86 changed files with 7,430 additions and 0 deletions.
50 changes: 50 additions & 0 deletions .github/workflows/pipeline.yml
@@ -0,0 +1,50 @@
name: CI/CD Pipeline

on:
  push:
    branches:
      - '**'
    tags-ignore:
      - '**'

env:
  JAVA_VERSION: 21

jobs:
  test:
    name: Build and test
    runs-on: big

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Setup Java
        uses: actions/setup-java@v3
        with:
          distribution: corretto
          java-version: ${{ env.JAVA_VERSION }}
          cache: sbt

      - name: Setup Sbt
        uses: sbt/setup-sbt@v1

      # Needed for the integration tests
      # - name: Authenticate to Google Cloud
      #   uses: google-github-actions/auth@v1
      #   with:
      #     credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}
      #
      # - name: Set up Cloud SDK
      #   uses: google-github-actions/setup-gcloud@v1
      #   with:
      #     project_id: pre-production-project-id

      - name: Build and test
        id: run_tests
        env:
          RUN_INTEGRATION_TESTS: false
        run: |
          set -o pipefail
          sbt clean test
          # sbt package bq-writer/docker:publish gcs-writer/docker:publish
11 changes: 11 additions & 0 deletions .gitignore
@@ -0,0 +1,11 @@
.metals/
.bloop/
.ammonite/
.idea/
.vscode/
.bsp/
.scala-build

target/

.DS_Store
32 changes: 32 additions & 0 deletions .scalafmt.conf
@@ -0,0 +1,32 @@
version = 3.1.1
runner.dialect = "scala213"
docstrings.wrap = "yes"
trailingCommas = "keep"
maxColumn = 100

indent.extendSite = 4
indent.withSiteRelativeToExtends = 2

# SpaceAsterisk variant used to be called ScalaDoc
# Use ScalaDoc style and enable wrapping when reaching `maxColumn`
docstrings.style = "SpaceAsterisk"
docstrings.wrap = yes
docstrings.oneline = fold

lineEndings = preserve

# Avoid unnecessary break lines when a curly brace is put after a select (e.g., a `map`)
includeCurlyBraceInSelectChains = false

# Better readability for selects without apply (i.e., without `()`)
includeNoParensInSelectChains = true

# Ensure a separate line is created for each selector within a `{...}` import.
rewrite.rules += Imports
rewrite.imports.expand = true

# Ensure code blocks inside markdown files get formatted too
project.includePaths."+" = ["glob:**.md"]

newlines.selectChains = keep

21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) [year] [fullname]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
18 changes: 18 additions & 0 deletions bq-writer/src/main/resources/logback.xml
@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
  <appender name="json" class="ch.qos.logback.core.ConsoleAppender">
    <encoder class="net.logstash.logback.encoder.LogstashEncoder">
      <fieldNames>
        <timestamp>timestamp</timestamp>
        <version>[ignore]</version>
        <logger>logger</logger>
        <thread>[ignore]</thread>
        <levelValue>[ignore]</levelValue>
        <level>severity</level>
      </fieldNames>
    </encoder>
  </appender>
  <root level="WARN">
    <appender-ref ref="json"/>
  </root>
</configuration>
64 changes: 64 additions & 0 deletions bq-writer/src/main/scala/com/adevinta/bq/bqwriter/App.scala
@@ -0,0 +1,64 @@
package com.adevinta.bq.bqwriter

import com.adevinta.bq.bqwriter.config.AppConfig
import com.adevinta.bq.bqwriter.config.AppConfigLive
import com.adevinta.bq.shared.remotedir.GcsRemoteDirectory
import zio._
import zio.logging.consoleJsonLogger
import com.adevinta.zc.http.Http
import com.adevinta.zc.metrics.MetricsLive

object App extends ZIOAppDefault {

  override val bootstrap: ZLayer[Any, Nothing, Unit] =
    Runtime.removeDefaultLoggers >>>
      Runtime.setUnhandledErrorLogLevel(LogLevel.Warning) >>>
      consoleJsonLogger().orDie

  def run: ZIO[Scope, Nothing, ExitCode] = {
    val program =
      logConfig *>
        (Http.runBasicHttpServer <&>
          ZIO.serviceWithZIO[JobCoordinator](_.runLoop()) <&>
          ZIO.serviceWithZIO[StagingRepository](_.runDoneFileCleanup()))

    program
      .provideSome[Scope](
        AppConfigLive.live,
        ZLayer.fromFunction((_: AppConfig).httpConfig),
        ZLayer.fromFunction((_: AppConfig).remoteDirectoryConfig),
        ZLayer.fromFunction((_: AppConfig).jobsRepositoryConfig),
        ZLayer.fromFunction((_: AppConfig).jobCoordinatorConfig),
        ZLayer.fromFunction((_: AppConfig).bqConfig),
        ZLayer.fromFunction((_: AppConfig).gcsRemoteDirectoryConfig),
        MetricsLive.layer,
        GcsRemoteDirectory.storageLayer,
        GcsRemoteDirectory.layer,
        BigQueryJobServiceLive.bigQueryLayer,
        StagingRepositoryLive.layer,
        BigQueryJobServiceLive.layer,
        GcsJobsRepositoryLive.layer,
        JobCoordinatorLive.layer,
      )
      .onError {
        case cause if cause.isInterruptedOnly =>
          ZIO.logWarning(s"Shutting down.")
        case cause => ZIO.logErrorCause(s"Execution failed.", cause)
      }
      .exitCode <*
      // Give logback some time to actually log before the JVM exits.
      ZIO.sleep(100.millis)
  }

  private val logConfig =
    for {
      config <- ZIO.service[AppConfig]
      procCount = java.lang.Runtime.getRuntime.availableProcessors()
      _ <- ZIO.logWarning(
        s"""Starting bq-writer
           |Available processors: $procCount
           |Configuration: $config""".stripMargin
      )
    } yield ()

}
@@ -0,0 +1,9 @@
package com.adevinta.bq.bqwriter

import scala.util.control.NoStackTrace

sealed trait AppError extends NoStackTrace

object AppError {
  final case class JsonDecodingError(message: String) extends Throwable(message) with AppError
}
@@ -0,0 +1,172 @@
package com.adevinta.bq.bqwriter

import com.adevinta.bq.bqwriter.config.BigQueryConfig
import com.adevinta.bq.bqwriter.job.BQJobId
import com.adevinta.bq.bqwriter.job.BigQueryJob
import com.adevinta.bq.bqwriter.job.JobError
import com.adevinta.bq.bqwriter.job.JobStatus

import java.time.Instant
import scala.jdk.CollectionConverters._
import com.google.cloud.bigquery.BigQuery
import com.google.cloud.bigquery.BigQueryOptions
import com.google.cloud.bigquery.FormatOptions
import com.google.cloud.bigquery.JobId
import com.google.cloud.bigquery.{Job => BQJob}
import com.google.cloud.bigquery.{JobInfo => BQJobInfo}
import com.google.cloud.bigquery.{JobStatus => BQJobStatus}
import com.google.cloud.bigquery.{LoadJobConfiguration => BQLoadJobConfiguration}
import com.google.cloud.bigquery.{TableId => BQTableId}
import com.google.cloud.bigquery.{TimePartitioning => BQTimePartitioning}
import com.google.common.collect.ImmutableList
import zio._
import com.adevinta.zc.metrics.Metrics

/** Service that can submit and fetch job status from BigQuery. */
trait BigQueryJobService {

  /** Fetches the status of the given jobs from BigQuery, and returns these jobs with the updated
    * status.
    */
  def jobsWithUpdatedStatus(
      jobIdPrefix: String,
      runningJobs: Chunk[BigQueryJob]
  ): Task[Chunk[BigQueryJob]]

  /** Submits new load jobs to BigQuery. */
  def submitJobs(newJobs: Chunk[BigQueryJob]): ZIO[Any, Throwable, Unit]
}

object BigQueryJobServiceLive {

  /** Gives access to underlying BigQuery API. */
  val bigQueryLayer: ZLayer[BigQueryConfig, Throwable, BigQuery] = ZLayer {
    for {
      config <- ZIO.service[BigQueryConfig]
      credentials <- config.gcpCredentials.credentials
      bigquery <- ZIO.attemptBlocking(
        BigQueryOptions
          .newBuilder()
          .setProjectId(config.projectIdString)
          .setLocation(config.location)
          .setCredentials(credentials)
          .build()
          .getService
      )
    } yield bigquery
  }

  private type Dependencies = BigQueryConfig with BigQuery with JobsRepository with Metrics

  val layer: ZLayer[Dependencies, Nothing, BigQueryJobService] =
    ZLayer {
      for {
        config <- ZIO.service[BigQueryConfig]
        jobsRepository <- ZIO.service[JobsRepository]
        bigQuery <- ZIO.service[BigQuery]
      } yield {
        BigQueryJobServiceLive(
          config,
          jobsRepository,
          bigQuery
        )
      }
    }
}

final case class BigQueryJobServiceLive(
    config: BigQueryConfig,
    jobsRepository: JobsRepository,
    bigQuery: BigQuery
) extends BigQueryJobService {

  override def jobsWithUpdatedStatus(
      jobIdPrefix: String,
      runningJobs: Chunk[BigQueryJob]
  ): Task[Chunk[BigQueryJob]] =
    if (runningJobs.isEmpty) ZIO.succeed(runningJobs)
    else {
      val creationTimeOfOldestJob = runningJobs.map(_.creationTime).min
      for {
        jobsStatuses <- fetchJobsStatuses(jobIdPrefix, creationTimeOfOldestJob)
        _ <- ZIO.foreachDiscard(jobsStatuses) { case (jobId, jobStatus) =>
          jobStatus match {
            case JobStatus.Failed(jobError) =>
              ZIO.logWarning(s"Job $jobId failed with error $jobError")
            case _ => ZIO.unit
          }
        }
      } yield {
        runningJobs.map { job =>
          jobsStatuses.get(job.jobId).fold(job)(status => job.copy(jobStatus = status))
        }
      }
    }

  override def submitJobs(newJobs: Chunk[BigQueryJob]): ZIO[Any, Throwable, Unit] =
    ZIO
      .foreachDiscard(newJobs) { newJob =>
        ZIO.attemptBlocking {
          val jobInfo = BQJobInfo
            .newBuilder(makeLoadJobConfiguration(newJob))
            .setJobId(JobId.of(config.projectIdString, newJob.jobUId))
            .build()

          // Creates BQ Job
          bigQuery.create(jobInfo)
        }
      }
      .unit

  /** Fetches the job status from BigQuery. */
  // package private access for testing
  private[bqwriter] def fetchJobsStatuses(
      jobPrefix: String,
      creationTimeOfOldestJob: Instant,
  ): Task[Map[BQJobId, JobStatus]] =
    ZIO.attemptBlocking {
      bigQuery
        .listJobs(
          BigQuery.JobListOption.minCreationTime(creationTimeOfOldestJob.toEpochMilli),
          BigQuery.JobListOption.fields(BigQuery.JobField.ID, BigQuery.JobField.STATUS),
        )
        .iterateAll()
        .asScala
        .filter(_.getJobId.getJob.startsWith(jobPrefix))
        .map(j => BQJobId(j.getJobId.getJob) -> getJobStatus(j))
        .toMap
    }
      .retry(Schedule.fibonacci(100.milli) && Schedule.recurs(3))

  private def getJobStatus(job: BQJob): JobStatus =
    job.getStatus.getState match {
      case BQJobStatus.State.RUNNING => JobStatus.Running
      case BQJobStatus.State.PENDING => JobStatus.Pending
      case BQJobStatus.State.DONE =>
        Option(job.getStatus.getError)
          .fold[JobStatus](JobStatus.Success)(e => JobStatus.Failed(JobError(e)))
    }

  private[bqwriter] def makeLoadJobConfiguration(job: BigQueryJob): BQLoadJobConfiguration = {
    val destinationTableId: BQTableId = BQTableId.of(job.tableId.datasetName, job.tableId.tableName)
    val gcsFileUris = job.fileRefs.map(_.uri).asJava
    BQLoadJobConfiguration
      .newBuilder(destinationTableId, gcsFileUris)
      .setFormatOptions(FormatOptions.avro)
      .setUseAvroLogicalTypes(true)
      .setSchemaUpdateOptions(
        ImmutableList.of(
          BQJobInfo.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
          BQJobInfo.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
        )
      )
      .setTimePartitioning(
        BQTimePartitioning
          .newBuilder(BQTimePartitioning.Type.DAY)
          .setField("eventDate")
          .setExpirationMs(config.partitionRetentionInDays.days.toMillis)
          .build()
      )
      .build()
  }
}