Skip to content

Commit

Permalink
Rewrite BigQuery Loader from scratch
Browse files Browse the repository at this point in the history
  • Loading branch information
istreeter committed Jan 8, 2024
0 parents commit 6566ddd
Show file tree
Hide file tree
Showing 45 changed files with 3,351 additions and 0 deletions.
104 changes: 104 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# CI workflow: checks formatting and runs the tests, then (for tag refs only)
# builds and publishes the Docker images of each loader app.
name: CI

# Runs on every pushed tag and on pull requests.
on:
  push:
    tags:
      - '*'
  pull_request:

jobs:
  # Formatting check followed by the sbt test suite, on JDK 11.
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: coursier/cache-action@v6
      - name: Set up JDK 11
        uses: actions/setup-java@v1
        with:
          java-version: 11
      - name: Check Scala formatting
        run: sbt scalafmtCheckAll scalafmtSbtCheck
      - name: Run tests
        run: sbt test

  # Publishes the regular and distroless images for every app in the matrix.
  # Requires the `test` job to succeed and only runs when the ref is a tag.
  publish_docker:
    needs: test
    if: github.ref_type == 'tag'
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Each value is both an sbt project name and a directory under modules/.
        app:
          - kafka
          - pubsub
          - kinesis
    steps:
      - name: Checkout Github
        uses: actions/checkout@v2
      - uses: coursier/cache-action@v6
      - name: Set up JDK 11
        uses: actions/setup-java@v1
        with:
          java-version: 11
      - name: Docker login
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      # docker:stage writes the Dockerfile and build context under target/docker/stage,
      # which the two build-push steps below use as their context.
      - name: Stage the Docker build
        run: sbt "project ${{ matrix.app }}" docker:stage
      - name: Stage the Docker Distroless build
        run: sbt "project ${{ matrix.app }}Distroless" docker:stage
      # Tags for the regular image. The `latest*` tags are skipped when the
      # tag name contains 'rc'.
      - name: Docker metadata
        id: meta
        uses: docker/metadata-action@v3
        with:
          images: "snowplow/bigquery-loader-${{ matrix.app }}"
          tags: |
            type=raw,value=latest,enable=${{ !contains(github.ref_name, 'rc') }}
            type=raw,value=latest-focal,enable=${{ !contains(github.ref_name, 'rc') }}
            type=raw,value=${{ github.ref_name }}
            type=raw,value=${{ github.ref_name }}-focal
          flavor: |
            latest=false
      # Tags for the distroless image: same image name, `-distroless` suffix.
      - name: Docker metadata distroless
        id: metaDistroless
        uses: docker/metadata-action@v3
        with:
          images: "snowplow/bigquery-loader-${{ matrix.app }}"
          tags: |
            type=raw,value=latest-distroless,enable=${{ !contains(github.ref_name, 'rc') }}
            type=raw,value=${{ github.ref_name }}-distroless
          flavor: |
            latest=false
      # QEMU and Buildx are set up before the multi-platform (amd64 + arm64/v8) builds.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      - name: Push image
        uses: docker/build-push-action@v2
        with:
          context: modules/${{ matrix.app }}/target/docker/stage
          file: modules/${{ matrix.app }}/target/docker/stage/Dockerfile
          platforms: linux/amd64,linux/arm64/v8
          tags: ${{ steps.meta.outputs.tags }}
          push: true
      - name: Push distroless image
        uses: docker/build-push-action@v2
        with:
          context: modules/distroless/${{ matrix.app }}/target/docker/stage
          file: modules/distroless/${{ matrix.app }}/target/docker/stage/Dockerfile
          platforms: linux/amd64,linux/arm64/v8
          tags: ${{ steps.metaDistroless.outputs.tags }}
          push: true
      # The Snyk steps currently run for every tag, release candidates included;
      # the `if` guards that would exclude 'rc' tags are still commented out (see TODOs).
      - name: Build local image, which is needed to run Snyk
        # if: ${{ !contains(github.ref_name, 'rc') }} # TODO: uncomment before final release
        run: sbt "project ${{ matrix.app }}Distroless" docker:publishLocal
      - name: Run Snyk to check for vulnerabilities
        uses: snyk/actions/docker@master
        # if: ${{ !contains(github.ref_name, 'rc') }} # TODO: uncomment before final release
        with:
          image: "snowplow/bigquery-loader-${{ matrix.app }}:${{ github.ref_name }}-distroless"
          args: "--app-vulns --org=99605b41-ca0f-42c9-a9ff-45c201a10a26"
          command: monitor
        env:
          SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
target/
lib/
50 changes: 50 additions & 0 deletions .scalafmt.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# scalafmt configuration for this repository.
# Formatter release and the Scala dialect used when parsing sources.
version = "3.6.0"
runner.dialect = scala213
# Alignment is configured explicitly (no preset): the token list below names
# each token together with the syntax-tree owners it applies to.
align.preset = none
align.openParenCallSite = false
align.arrowEnumeratorGenerator = true
align.tokens = [
{
code = "=>"
owners = [{
regex = "Case"
}]
},
{
code = "="
owners = []
},
{
code = "%"
owners = [{
regex = "Term.ApplyInfix"
}]
},
{
code = "%%"
owners = [{
regex = "Term.ApplyInfix"
}]
}
]
# Line width for code (140) and, separately, for wrapped docstrings (100).
maxColumn = 140
docstrings.style = Asterisk
docstrings.wrap = yes
docstrings.wrapMaxColumn = 100
# Method chains, spacing and indentation.
optIn.breakChainOnFirstMethodDot = true
spaces.afterKeywordBeforeParen = true
indent.callSite = 2
indent.defnSite = 2
# Vertical layout of definitions with 3 or more parameters.
verticalMultiline.atDefnSite = true
verticalMultiline.arityThreshold = 3
verticalMultiline.newlineAfterOpenParen = true
danglingParentheses.defnSite = true
danglingParentheses.exclude = []
importSelectors = noBinPack
# Rewrite rules enabled for formatting runs; imports are sorted in ASCII order.
rewrite.rules = [
Imports,
RedundantBraces,
RedundantParens,
PreferCurlyFors
]
rewrite.imports.sort = ascii
57 changes: 57 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Snowplow Limited Use License Agreement

_Version 1.0, January 2024_

This Snowplow Limited Use License Agreement, Version 1.0 (the “Agreement”) sets forth the terms on which Snowplow Analytics, Ltd. (“Snowplow”) makes available certain software (the “Software”). BY INSTALLING, DOWNLOADING, ACCESSING, OR USING ANY OF THE SOFTWARE, YOU AGREE TO THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE TO SUCH TERMS AND CONDITIONS, YOU MUST NOT USE THE SOFTWARE. IF YOU ARE RECEIVING THE SOFTWARE ON BEHALF OF A LEGAL ENTITY, YOU REPRESENT AND WARRANT THAT YOU HAVE THE ACTUAL AUTHORITY TO AGREE TO THE TERMS AND CONDITIONS OF THIS AGREEMENT ON BEHALF OF SUCH ENTITY. “Licensee” means you, an individual, or the entity on whose behalf you are receiving the Software.

## LICENSE GRANT AND CONDITIONS

**1.1 License.** Subject to the terms and conditions of this Agreement, Snowplow hereby grants to Licensee a non-exclusive, royalty-free, worldwide, non-transferable, non-sublicensable license during the term of this Agreement to: (a) use the Software; (b) prepare modifications and derivative works of the Software; and (c) reproduce copies of the Software (the “License”). No right to distribute or make available the Software is granted under this License. Licensee is not granted the right to, and Licensee shall not, exercise the License for any Excluded Purpose.

**1.2** For purposes of this Agreement, an “Excluded Purpose” is any use that is either a Competing Use or a Highly-Available Production Use, or both of them.

* **1.2.1** A “Competing Use” is making available any on-premises or distributed software product, or any software-as-a-service, platform-as-a-service, infrastructure-as-a-service, or other similar online service, that competes with any products or services that Snowplow or any of its affiliates provides using the Software.

* **1.2.2** Highly-Available Production Use is any highly-available use, including without limitation any use where multiple instances of any Software component run concurrently to avoid a single point of failure, in a production environment, where production means use on live data.

**1.3 Conditions.** In consideration of the License, Licensee’s use of the Software is subject to the following conditions:

* **a.** Licensee must cause any Software modified by Licensee to carry prominent notices stating that Licensee modified the Software.

* **b.** On each Software copy, Licensee shall reproduce and not remove or alter all Snowplow or third party copyright or other proprietary notices contained in the Software, and Licensee must include the notice below on each copy.

```
This software is made available by Snowplow Analytics, Ltd.,
under the terms of the Snowplow Limited Use License Agreement, Version 1.0
located at https://docs.snowplow.io/limited-use-license-1.0
BY INSTALLING, DOWNLOADING, ACCESSING, USING OR DISTRIBUTING ANY PORTION
OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT.
```

**1.4 Licensee Modifications.** Licensee may add its own copyright notices to modifications made by Licensee.

**1.5 No Sublicensing.** The License does not include the right to sublicense the Software, however, each recipient to which Licensee provides the Software may exercise the Licenses so long as such recipient agrees to the terms and conditions of this Agreement.

## TERM AND TERMINATION

This Agreement will continue unless and until earlier terminated as set forth herein. If Licensee breaches any of its conditions or obligations under this Agreement, this Agreement will terminate automatically and the License will terminate automatically and permanently.

## INTELLECTUAL PROPERTY

As between the parties, Snowplow will retain all right, title, and interest in the Software, and all intellectual property rights therein. Snowplow hereby reserves all rights not expressly granted to Licensee in this Agreement. Snowplow hereby reserves all rights in its trademarks and service marks, and no licenses therein are granted in this Agreement.

## DISCLAIMER

SNOWPLOW HEREBY DISCLAIMS ANY AND ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, AND SPECIFICALLY DISCLAIMS ANY WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, WITH RESPECT TO THE SOFTWARE.

## LIMITATION OF LIABILITY

SNOWPLOW WILL NOT BE LIABLE FOR ANY DAMAGES OF ANY KIND, INCLUDING BUT NOT LIMITED TO LOST PROFITS OR ANY CONSEQUENTIAL, SPECIAL, INCIDENTAL, INDIRECT, OR DIRECT DAMAGES, HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ARISING OUT OF THIS AGREEMENT. THE FOREGOING SHALL APPLY TO THE EXTENT PERMITTED BY APPLICABLE LAW.

## GENERAL

**6.1 Governing Law.** This Agreement will be governed by and interpreted in accordance with the laws of the state of Delaware, without reference to its conflict of laws principles. If Licensee is located within the United States, all disputes arising out of this Agreement are subject to the exclusive jurisdiction of courts located in Delaware, USA. If Licensee is located outside of the United States, any dispute, controversy or claim arising out of or relating to this Agreement will be referred to and finally determined by arbitration in accordance with the JAMS International Arbitration Rules. The tribunal will consist of one arbitrator. The place of arbitration will be in the State of Delaware, USA. The language to be used in the arbitral proceedings will be English. Judgment upon the award rendered by the arbitrator may be entered in any court having jurisdiction thereof.

**6.2. Assignment.** Licensee is not authorized to assign its rights under this Agreement to any third party. Snowplow may freely assign its rights under this Agreement to any third party.

**6.3. Other.** This Agreement is the entire agreement between the parties regarding the subject matter hereof. No amendment or modification of this Agreement will be valid or binding upon the parties unless made in writing and signed by the duly authorized representatives of both parties. In the event that any provision, including without limitation any condition, of this Agreement is held to be unenforceable, this Agreement and all licenses and rights granted hereunder will immediately terminate. Waiver by Snowplow of a breach of any provision of this Agreement or the failure by Snowplow to exercise any right hereunder will not be construed as a waiver of any subsequent breach of that right or as a waiver of any other right.
80 changes: 80 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Snowplow BigQuery Loader

[![Build Status][build-image]][build]
[![Release][release-image]][releases]
[![License][license-image]][license]

## Introduction

This project contains applications required to load Snowplow data into BigQuery with low latency.

Check out [the example config files](./config) for how to configure your loader.

#### Azure

The Azure BigQuery loader reads the stream of enriched events from Event Hubs.

Basic usage:
```bash
docker run \
  -v /path/to/config.hocon:/var/config.hocon \
  -v /path/to/iglu.json:/var/iglu.json \
  snowplow/bigquery-loader-kafka:2.0.0 \
  --config /var/config.hocon \
  --iglu-config /var/iglu.json
```

#### GCP

The GCP BigQuery loader reads the stream of enriched events from Pubsub.

```bash
docker run \
  -v /path/to/config.hocon:/var/config.hocon \
  -v /path/to/iglu.json:/var/iglu.json \
  snowplow/bigquery-loader-pubsub:2.0.0 \
  --config /var/config.hocon \
  --iglu-config /var/iglu.json
```

#### AWS

The AWS BigQuery loader reads the stream of enriched events from Kinesis.

```bash
docker run \
  -v /path/to/config.hocon:/var/config.hocon \
  -v /path/to/iglu.json:/var/iglu.json \
  snowplow/bigquery-loader-kinesis:2.0.0 \
  --config /var/config.hocon \
  --iglu-config /var/iglu.json
```

## Find out more

| Technical Docs | Setup Guide | Roadmap & Contributing |
|----------------------------|----------------------|------------------------|
| ![i1][techdocs-image] | ![i2][setup-image] | ![i3][roadmap-image] |
| [Technical Docs][techdocs] | [Setup Guide][setup] | [Roadmap][roadmap] |



## Copyright and License

Copyright (c) 2012-present Snowplow Analytics Ltd. All rights reserved.

Licensed under the [Snowplow Limited Use License Agreement][license]. _(If you are uncertain how it applies to your use case, check our answers to [frequently asked questions][faq].)_

[techdocs-image]: https://d3i6fms1cm1j0i.cloudfront.net/github/images/techdocs.png
[setup-image]: https://d3i6fms1cm1j0i.cloudfront.net/github/images/setup.png
[roadmap-image]: https://d3i6fms1cm1j0i.cloudfront.net/github/images/roadmap.png
[setup]: https://docs.snowplow.io/docs/getting-started-on-snowplow-open-source/
[techdocs]: https://docs.snowplow.io/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/
[roadmap]: https://github.com/snowplow/snowplow/projects/7

[build-image]: https://github.com/snowplow-incubator/snowplow-bigquery-loader/workflows/CI/badge.svg
[build]: https://github.com/snowplow-incubator/snowplow-bigquery-loader/actions/workflows/ci.yml

[release-image]: https://img.shields.io/badge/release-2.0.0-blue.svg?style=flat
[releases]: https://github.com/snowplow-incubator/snowplow-bigquery-loader/releases

[license]: https://docs.snowplow.io/limited-use-license-1.0
[license-image]: https://img.shields.io/badge/license-Snowplow--Limited--Use-blue.svg?style=flat
73 changes: 73 additions & 0 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/**
* Copyright (c) 2013-present Snowplow Analytics Ltd. All rights reserved.
*
* This software is made available by Snowplow Analytics, Ltd., under the terms of the Snowplow
* Limited Use License Agreement, Version 1.0 located at
* https://docs.snowplow.io/limited-use-license-1.0 BY INSTALLING, DOWNLOADING, ACCESSING, USING OR
* DISTRIBUTING ANY PORTION OF THE SOFTWARE, YOU AGREE TO THE TERMS OF SUCH LICENSE AGREEMENT.
*/

// Root project at the repository top level. It aggregates `core` and, for each
// of kafka / pubsub / kinesis, both the regular and the distroless project.
// The explicit `Project` annotation matches every other project definition in this file.
lazy val root: Project = project
  .in(file("."))
  .aggregate(
    core,
    kafka,
    kafkaDistroless,
    pubsub,
    pubsubDistroless,
    kinesis,
    kinesisDistroless
  )

// Shared module under modules/core; every app project below depends on it.
lazy val core: Project = (project in file("modules/core"))
  .settings(
    BuildSettings.coreSettings,
    libraryDependencies ++= Dependencies.coreDependencies
  )
  .enablePlugins(IgluSchemaPlugin)

// Kafka app under modules/kafka, packaged with the standard Snowplow Docker plugin.
lazy val kafka: Project = (project in file("modules/kafka"))
  .settings(
    BuildSettings.kafkaSettings,
    libraryDependencies ++= Dependencies.kafkaDependencies
  )
  .dependsOn(core)
  .enablePlugins(BuildInfoPlugin, JavaAppPackaging, SnowplowDockerPlugin)

// Distroless variant of the Kafka app: same settings and dependencies as `kafka`,
// but packaged with the distroless Docker plugin.
lazy val kafkaDistroless: Project = (project in file("modules/distroless/kafka"))
  .settings(
    BuildSettings.kafkaSettings,
    libraryDependencies ++= Dependencies.kafkaDependencies,
    // Point at the `kafka` project's source directory rather than this project's own.
    sourceDirectory := (kafka / sourceDirectory).value
  )
  .dependsOn(core)
  .enablePlugins(BuildInfoPlugin, JavaAppPackaging, SnowplowDistrolessDockerPlugin)

// Pubsub app under modules/pubsub, packaged with the standard Snowplow Docker plugin.
lazy val pubsub: Project = (project in file("modules/pubsub"))
  .settings(
    BuildSettings.pubsubSettings,
    libraryDependencies ++= Dependencies.pubsubDependencies
  )
  .dependsOn(core)
  .enablePlugins(BuildInfoPlugin, JavaAppPackaging, SnowplowDockerPlugin)

// Distroless variant of the Pubsub app: same settings and dependencies as `pubsub`,
// but packaged with the distroless Docker plugin.
lazy val pubsubDistroless: Project = (project in file("modules/distroless/pubsub"))
  .settings(
    BuildSettings.pubsubSettings,
    libraryDependencies ++= Dependencies.pubsubDependencies,
    // Point at the `pubsub` project's source directory rather than this project's own.
    sourceDirectory := (pubsub / sourceDirectory).value
  )
  .dependsOn(core)
  .enablePlugins(BuildInfoPlugin, JavaAppPackaging, SnowplowDistrolessDockerPlugin)

// Kinesis app under modules/kinesis, packaged with the standard Snowplow Docker plugin.
lazy val kinesis: Project = (project in file("modules/kinesis"))
  .settings(
    BuildSettings.kinesisSettings,
    libraryDependencies ++= Dependencies.kinesisDependencies
  )
  .dependsOn(core)
  .enablePlugins(BuildInfoPlugin, JavaAppPackaging, SnowplowDockerPlugin)

// Distroless variant of the Kinesis app: same settings and dependencies as `kinesis`,
// but packaged with the distroless Docker plugin.
lazy val kinesisDistroless: Project = (project in file("modules/distroless/kinesis"))
  .settings(
    BuildSettings.kinesisSettings,
    libraryDependencies ++= Dependencies.kinesisDependencies,
    // Point at the `kinesis` project's source directory rather than this project's own.
    sourceDirectory := (kinesis / sourceDirectory).value
  )
  .dependsOn(core)
  .enablePlugins(BuildInfoPlugin, JavaAppPackaging, SnowplowDistrolessDockerPlugin)

// Build-wide default (ThisBuild scope): enable sbt's `fork` setting for every project.
ThisBuild / fork := true
23 changes: 23 additions & 0 deletions config/config.azure.minimal.hocon
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
  # Presumably records acceptance of the Snowplow Limited Use License Agreement
  # shipped with this project; confirm against the loader's config reference.
  "license": {
    "accept": true
  }

  # Topic the loader reads from, and the servers used to reach it.
  "input": {
    "topicName": "sp-dev-enriched"
    "bootstrapServers": "localhost:9092"
  }

  "output": {

    # Destination project and dataset for loaded events.
    "good": {
      "project": "my-project"
      "dataset": "snowplow"
    }

    # Topic for the "bad" output (presumably events that could not be loaded; verify).
    "bad": {
      "topicName": "sp-dev-bad"
      "bootstrapServers": "localhost:9092"
    }
  }
}
Loading

0 comments on commit 6566ddd

Please sign in to comment.