diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..aa77d404 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.idea/ +.DS_Store +.vscode/ +.vendor-new/ + +vendor/ +node_modules/ diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000..e2415477 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,3 @@ +# These owners will be the default owners for everything in +# the repo. Unless a later match takes precedence. +* @anandswaminathan @kumare3 @mwylde diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..4c3a38cc --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,2 @@ +This project is governed by [Lyft's code of conduct](https://github.com/lyft/code-of-conduct). +All contributors and participants agree to abide by its terms. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..cd6b80cd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +# Using go1.10.4 +FROM golang:1.10.4-alpine3.8 as builder +RUN apk add git openssh-client make curl bash + +COPY boilerplate/lyft/golang_test_targets/dep_install.sh /go/src/github.com/lyft/flinkk8soperator/ + +# COPY only the dep files for efficient caching +COPY Gopkg.* /go/src/github.com/lyft/flinkk8soperator/ +WORKDIR /go/src/github.com/lyft/flinkk8soperator + +# Pull dependencies +RUN : \ + && sh dep_install.sh \ + && dep ensure -vendor-only + +# COPY the rest of the source code +COPY . /go/src/github.com/lyft/flinkk8soperator/ + +# This 'linux_compile' target should compile binaries to the /artifacts directory +# The main entrypoint should be compiled to /artifacts/flinkk8soperator +RUN make linux_compile + +# update the PATH to include the /artifacts directory +ENV PATH="/artifacts:${PATH}" + +# This will eventually move to centurylink/ca-certs:latest for minimum possible image size +FROM alpine:3.8 +COPY --from=builder /artifacts /bin +CMD ["flinkoperator"] diff --git a/Gopkg.lock b/Gopkg.lock new file mode 100644 index 00000000..7f8529c8 --- /dev/null +++ b/Gopkg.lock @@ -0,0 +1,1088 @@ +# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. 
+ + +[[projects]] + digest = "1:c0952fb3cf9506cff577b4edf4458889570dcbd2902a7b90a1fd96bfbb97ccd8" + name = "github.com/PuerkitoBio/purell" + packages = ["."] + pruneopts = "" + revision = "44968752391892e1b0d0b821ee79e9a85fa13049" + version = "v1.1.1" + +[[projects]] + branch = "master" + digest = "1:331a419049c2be691e5ba1d24342fc77c7e767a80c666a18fd8a9f7b82419c1c" + name = "github.com/PuerkitoBio/urlesc" + packages = ["."] + pruneopts = "" + revision = "de5bf2ad457846296e2031421a34e2568e304e35" + +[[projects]] + branch = "master" + digest = "1:a74730e052a45a3fab1d310fdef2ec17ae3d6af16228421e238320846f2aaec8" + name = "github.com/alecthomas/template" + packages = [ + ".", + "parse", + ] + pruneopts = "" + revision = "a0175ee3bccc567396460bf5acd36800cb10c49c" + +[[projects]] + branch = "master" + digest = "1:8483994d21404c8a1d489f6be756e25bfccd3b45d65821f25695577791a08e68" + name = "github.com/alecthomas/units" + packages = ["."] + pruneopts = "" + revision = "2efee857e7cfd4f3d0138cc3cbb1b4966962b93a" + +[[projects]] + digest = "1:0d3deb8a6da8ffba5635d6fb1d2144662200def6c9d82a35a6d05d6c2d4a48f9" + name = "github.com/beorn7/perks" + packages = ["quantile"] + pruneopts = "" + revision = "4b2b341e8d7715fae06375aa633dbb6e91b3fb46" + version = "v1.0.0" + +[[projects]] + digest = "1:0deddd908b6b4b768cfc272c16ee61e7088a60f7fe2f06c547bd3d8e1f8b8e77" + name = "github.com/davecgh/go-spew" + packages = ["spew"] + pruneopts = "" + revision = "8991bc29aa16c548c550c7ff78260e27b9ab7c73" + version = "v1.1.1" + +[[projects]] + digest = "1:441b1f7b6b05288262516e5ccdf302c8142577aff71e41ffc3b0afa54a42fd7c" + name = "github.com/emicklei/go-restful" + packages = [ + ".", + "log", + ] + pruneopts = "" + revision = "103c9496ad8f7e687b8291b56750190012091a96" + version = "v2.9.4" + +[[projects]] + digest = "1:e988ed0ca0d81f4d28772760c02ee95084961311291bdfefc1b04617c178b722" + name = "github.com/fatih/color" + packages = ["."] + pruneopts = "" + revision = "5b77d2a35fb0ede96d138fc9a99f5c9b6aef11b4" + version = "v1.7.0" + +[[projects]] + branch = "master" + digest = "1:135223bf2c128b2158178ee48779ac9983b003634864d46b73e913c95f7a847e" + name = "github.com/fsnotify/fsnotify" + packages = ["."] + pruneopts = "" + revision = "1485a34d5d5723fea214f5710708e19a831720e4" + +[[projects]] + digest = "1:b13707423743d41665fd23f0c36b2f37bb49c30e94adb813319c44188a51ba22" + name = "github.com/ghodss/yaml" + packages = ["."] + pruneopts = "" + revision = "0ca9ea5df5451ffdf184b4428c902747c2c11cd7" + version = "v1.0.0" + +[[projects]] + digest = "1:65587005c6fa4293c0b8a2e457e689df7fda48cc5e1f5449ea2c1e7784551558" + name = "github.com/go-logr/logr" + packages = ["."] + pruneopts = "" + revision = "9fb12b3b21c5415d16ac18dc5cd42c1cfdd40c4e" + version = "v0.1.0" + +[[projects]] + digest = "1:d81dfed1aa731d8e4a45d87154ec15ef18da2aa80fa9a2f95bec38577a244a99" + name = "github.com/go-logr/zapr" + packages = ["."] + pruneopts = "" + revision = "03f06a783fbb7dfaf3f629c7825480e43a7105e6" + version = "v0.1.1" + +[[projects]] + digest = "1:c8052dcf3ec378a9a6bc4f00ecc10d6d5eb3cc1f8faaf6b2f70f047e8881d446" + name = "github.com/go-openapi/jsonpointer" + packages = ["."] + pruneopts = "" + revision = "ef5f0afec364d3b9396b7b77b43dbe26bf1f8004" + version = "v0.19.0" + +[[projects]] + digest = "1:1824e5330b35b2a2418d06aa55629cc59ad454b72e338aa125ba8ff98f16298b" + name = "github.com/go-openapi/jsonreference" + packages = ["."] + pruneopts = "" + revision = "8483a886a90412cd6858df4ea3483dce9c8e35a3" + version = "v0.19.0" + +[[projects]] + digest = 
"1:22359833c00982fb26c61ea501eabbad401de8d811ef5ffc16deddb1e43a21ae" + name = "github.com/go-openapi/spec" + packages = ["."] + pruneopts = "" + revision = "53d776530bf78a11b03a7b52dd8a083086b045e5" + version = "v0.19.0" + +[[projects]] + digest = "1:cc03d3134465b9834f70ae1d08be632e4179310d093db62dc1a58b397d10f554" + name = "github.com/go-openapi/swag" + packages = ["."] + pruneopts = "" + revision = "b3e2804c8535ee0d1b89320afd98474d5b8e9e3b" + version = "v0.19.0" + +[[projects]] + digest = "1:218d6d3952eb109dd9001ef0249e2e5f7881e5b06b5700a072f9649dc6df2778" + name = "github.com/go-resty/resty" + packages = ["."] + pruneopts = "" + revision = "fa5875c0caa5c260ab78acec5a244215a730247f" + version = "v1.12.0" + +[[projects]] + digest = "1:fd53b471edb4c28c7d297f617f4da0d33402755f58d6301e7ca1197ef0a90937" + name = "github.com/gogo/protobuf" + packages = [ + "proto", + "sortkeys", + ] + pruneopts = "" + revision = "ba06b47c162d49f2af050fb4c75bcbc86a159d5c" + version = "v1.2.1" + +[[projects]] + branch = "master" + digest = "1:107b233e45174dbab5b1324201d092ea9448e58243ab9f039e4c0f332e121e3a" + name = "github.com/golang/glog" + packages = ["."] + pruneopts = "" + revision = "23def4e6c14b4da8ac2ed8007337bc5eb5007998" + +[[projects]] + branch = "master" + digest = "1:f9714c0c017f2b821bccceeec2c7a93d29638346bb546c36ca5f90e751f91b9e" + name = "github.com/golang/groupcache" + packages = ["lru"] + pruneopts = "" + revision = "5b532d6fd5efaf7fa130d4e859a2fde0fc3a9e1b" + +[[projects]] + digest = "1:529d738b7976c3848cae5cf3a8036440166835e389c1f617af701eeb12a0518d" + name = "github.com/golang/protobuf" + packages = [ + "proto", + "ptypes", + "ptypes/any", + "ptypes/duration", + "ptypes/timestamp", + ] + pruneopts = "" + revision = "b5d812f8a3706043e23a9cd5babf2e5423744d30" + version = "v1.3.1" + +[[projects]] + digest = "1:1e5b1e14524ed08301977b7b8e10c719ed853cbf3f24ecb66fae783a46f207a6" + name = "github.com/google/btree" + packages = ["."] + pruneopts = "" + revision = "4030bb1f1f0c35b30ca7009e9ebd06849dd45306" + version = "v1.0.0" + +[[projects]] + digest = "1:8d4a577a9643f713c25a32151c0f26af7228b4b97a219b5ddb7fd38d16f6e673" + name = "github.com/google/gofuzz" + packages = ["."] + pruneopts = "" + revision = "f140a6486e521aad38f5917de355cbf147cc0496" + version = "v1.0.0" + +[[projects]] + digest = "1:ad92aa49f34cbc3546063c7eb2cabb55ee2278b72842eda80e2a20a8a06a8d73" + name = "github.com/google/uuid" + packages = ["."] + pruneopts = "" + revision = "0cd6bf5da1e1c83f8b45653022c74f71af0538a4" + version = "v1.1.1" + +[[projects]] + digest = "1:16b2837c8b3cf045fa2cdc82af0cf78b19582701394484ae76b2c3bc3c99ad73" + name = "github.com/googleapis/gnostic" + packages = [ + "OpenAPIv2", + "compiler", + "extensions", + ] + pruneopts = "" + revision = "7c663266750e7d82587642f65e60bc4083f1f84e" + version = "v0.2.0" + +[[projects]] + branch = "master" + digest = "1:326d7083af3723768cd8150db99b8ac730837b05ef290d5a042562905cc26210" + name = "github.com/gregjones/httpcache" + packages = [ + ".", + "diskcache", + ] + pruneopts = "" + revision = "3befbb6ad0cc97d4c25d851e9528915809e1a22f" + +[[projects]] + digest = "1:85f8f8d390a03287a563e215ea6bd0610c858042731a8b42062435a0dcbc485f" + name = "github.com/hashicorp/golang-lru" + packages = [ + ".", + "simplelru", + ] + pruneopts = "" + revision = "7087cb70de9f7a8bc0a10c375cb0d2280a8edf9c" + version = "v0.5.1" + +[[projects]] + digest = "1:d14365c51dd1d34d5c79833ec91413bfbb166be978724f15701e17080dc06dec" + name = "github.com/hashicorp/hcl" + packages = [ + ".", + "hcl/ast", + 
"hcl/parser", + "hcl/printer", + "hcl/scanner", + "hcl/strconv", + "hcl/token", + "json/parser", + "json/scanner", + "json/token", + ] + pruneopts = "" + revision = "8cb6e5b959231cc1119e43259c4a608f9c51a241" + version = "v1.0.0" + +[[projects]] + digest = "1:31bfd110d31505e9ffbc9478e31773bf05bf02adcaeb9b139af42684f9294c13" + name = "github.com/imdario/mergo" + packages = ["."] + pruneopts = "" + revision = "7c29201646fa3de8506f701213473dd407f19646" + version = "v0.3.7" + +[[projects]] + digest = "1:870d441fe217b8e689d7949fef6e43efbc787e50f200cb1e70dbca9204a1d6be" + name = "github.com/inconshreveable/mousetrap" + packages = ["."] + pruneopts = "" + revision = "76626ae9c91c4f2a10f34cad8ce83ea42c93bb75" + version = "v1.0" + +[[projects]] + digest = "1:9fd40d80bd0ef0cbe9fc46330c798a4185a1b309ab049455651afc50396ec924" + name = "github.com/jarcoal/httpmock" + packages = ["."] + pruneopts = "" + revision = "ac2099de8d3789d30b99b740d1a9d242097462df" + version = "v1.0.4" + +[[projects]] + digest = "1:12d3de2c11e54ea37d7f00daf85088ad5e61ec4e8a1f828d6c8b657976856be7" + name = "github.com/json-iterator/go" + packages = ["."] + pruneopts = "" + revision = "0ff49de124c6f76f8494e194af75bde0f1a49a29" + version = "v1.1.6" + +[[projects]] + digest = "1:0f51cee70b0d254dbc93c22666ea2abf211af81c1701a96d04e2284b408621db" + name = "github.com/konsorten/go-windows-terminal-sequences" + packages = ["."] + pruneopts = "" + revision = "f55edac94c9bbba5d6182a4be46d86a2c9b5b50e" + version = "v1.0.2" + +[[projects]] + digest = "1:3108ec0946181c60040ff51b811908f89d03e521e2b4ade5ef5c65b3c0e911ae" + name = "github.com/kr/pretty" + packages = ["."] + pruneopts = "" + revision = "73f6ac0b30a98e433b289500d779f50c1a6f0712" + version = "v0.1.0" + +[[projects]] + digest = "1:11b056b4421396ab14e384ab8ab8c2079b03f1e51aa5eb4d9b81f9e0d1aa8fbf" + name = "github.com/kr/text" + packages = ["."] + pruneopts = "" + revision = "e2ffdb16a802fe2bb95e2e35ff34f0e53aeef34f" + version = "v0.1.0" + +[[projects]] + digest = "1:35fe7fc05f04f79af905348b757b440723f67534f873abfef906e1a64dfe9e64" + name = "github.com/kubernetes-sigs/controller-runtime" + packages = ["pkg/runtime/signals"] + pruneopts = "" + revision = "12d98582e72927b6cd0123e2b4e819f9341ce62c" + version = "v0.1.10" + +[[projects]] + digest = "1:6a237a8e6d28b2805e4327cb1ab8a0551424284fd9f558b2d835bcb385f1e8fa" + name = "github.com/lyft/flytestdlib" + packages = [ + "atomic", + "config", + "config/files", + "config/viper", + "contextutils", + "logger", + "profutils", + "promutils", + "promutils/labeled", + "version", + ] + pruneopts = "" + revision = "7fc3ceecab7e0edcbdc4520f6437d1f9bdfecaec" + version = "v0.2.5" + +[[projects]] + digest = "1:ae39921edb7f801f7ce1b6b5484f9715a1dd2b52cb645daef095cd10fd6ee774" + name = "github.com/magiconair/properties" + packages = ["."] + pruneopts = "" + revision = "de8848e004dd33dc07a2947b3d76f618a7fc7ef1" + version = "v1.8.1" + +[[projects]] + branch = "master" + digest = "1:cae59d7b8243c671c9f544965522ba35c0fec48ee80adb9f1400cd2f33abbbec" + name = "github.com/mailru/easyjson" + packages = [ + "buffer", + "jlexer", + "jwriter", + ] + pruneopts = "" + revision = "1ea4449da9834f4d333f1cc461c374aea217d249" + +[[projects]] + branch = "master" + digest = "1:58050e2bc9621cc6b68c1da3e4a0d1c40ad1f89062b9855c26521fd42a97a106" + name = "github.com/mattbaird/jsonpatch" + packages = ["."] + pruneopts = "" + revision = "81af80346b1a01caae0cbc27fd3c1ba5b11e189f" + +[[projects]] + digest = "1:9ea83adf8e96d6304f394d40436f2eb44c1dc3250d223b74088cc253a6cd0a1c" + name 
= "github.com/mattn/go-colorable" + packages = ["."] + pruneopts = "" + revision = "167de6bfdfba052fa6b2d3664c8f5272e23c9072" + version = "v0.0.9" + +[[projects]] + digest = "1:d0600e4cf07697303f37130791b2ce4577367931416bea8ec4f601bde3f7c5bf" + name = "github.com/mattn/go-isatty" + packages = ["."] + pruneopts = "" + revision = "c2a7a6ca930a4cd0bc33a3f298eb71960732a3a7" + version = "v0.0.7" + +[[projects]] + digest = "1:63722a4b1e1717be7b98fc686e0b30d5e7f734b9e93d7dee86293b6deab7ea28" + name = "github.com/matttproud/golang_protobuf_extensions" + packages = ["pbutil"] + pruneopts = "" + revision = "c12348ce28de40eed0136aa2b644d0ee0650e56c" + version = "v1.0.1" + +[[projects]] + digest = "1:bcc46a0fbd9e933087bef394871256b5c60269575bb661935874729c65bbbf60" + name = "github.com/mitchellh/mapstructure" + packages = ["."] + pruneopts = "" + revision = "3536a929edddb9a5b34bd6861dc4a9647cb459fe" + version = "v1.1.2" + +[[projects]] + digest = "1:0c0ff2a89c1bb0d01887e1dac043ad7efbf3ec77482ef058ac423d13497e16fd" + name = "github.com/modern-go/concurrent" + packages = ["."] + pruneopts = "" + revision = "bacd9c7ef1dd9b15be4a9909b8ac7a4e313eec94" + version = "1.0.3" + +[[projects]] + digest = "1:e32bdbdb7c377a07a9a46378290059822efdce5c8d96fe71940d87cb4f918855" + name = "github.com/modern-go/reflect2" + packages = ["."] + pruneopts = "" + revision = "4b7aa43c6742a2c18fdef89dd197aaae7dac7ccd" + version = "1.0.1" + +[[projects]] + digest = "1:a5484d4fa43127138ae6e7b2299a6a52ae006c7f803d98d717f60abf3e97192e" + name = "github.com/pborman/uuid" + packages = ["."] + pruneopts = "" + revision = "adf5a7427709b9deb95d29d3fa8a2bf9cfd388f1" + version = "v1.2" + +[[projects]] + digest = "1:3d2c33720d4255686b9f4a7e4d3b94938ee36063f14705c5eb0f73347ed4c496" + name = "github.com/pelletier/go-toml" + packages = ["."] + pruneopts = "" + revision = "728039f679cbcd4f6a54e080d2219a4c4928c546" + version = "v1.4.0" + +[[projects]] + branch = "master" + digest = "1:5f0faa008e8ff4221b55a1a5057c8b02cb2fd68da6a65c9e31c82b72cbc836d0" + name = "github.com/petar/GoLLRB" + packages = ["llrb"] + pruneopts = "" + revision = "33fb24c13b99c46c93183c291836c573ac382536" + +[[projects]] + digest = "1:4709c61d984ef9ba99b037b047546d8a576ae984fb49486e48d99658aa750cd5" + name = "github.com/peterbourgon/diskv" + packages = ["."] + pruneopts = "" + revision = "0be1b92a6df0e4f5cb0a5d15fb7f643d0ad93ce6" + version = "v3.0.0" + +[[projects]] + digest = "1:1d7e1867c49a6dd9856598ef7c3123604ea3daabf5b83f303ff457bcbc410b1d" + name = "github.com/pkg/errors" + packages = ["."] + pruneopts = "" + revision = "ba968bfe8b2f7e042a574c888954fccecfa385b4" + version = "v0.8.1" + +[[projects]] + digest = "1:256484dbbcd271f9ecebc6795b2df8cad4c458dd0f5fd82a8c2fa0c29f233411" + name = "github.com/pmezard/go-difflib" + packages = ["difflib"] + pruneopts = "" + revision = "792786c7400a136282c1664665ae0a8db921c6c2" + version = "v1.0.0" + +[[projects]] + digest = "1:6f218995d6a74636cfcab45ce03005371e682b4b9bee0e5eb0ccfd83ef85364f" + name = "github.com/prometheus/client_golang" + packages = [ + "prometheus", + "prometheus/internal", + "prometheus/promhttp", + ] + pruneopts = "" + revision = "505eaef017263e299324067d40ca2c48f6a2cf50" + version = "v0.9.2" + +[[projects]] + branch = "master" + digest = "1:cd67319ee7536399990c4b00fae07c3413035a53193c644549a676091507cadc" + name = "github.com/prometheus/client_model" + packages = ["go"] + pruneopts = "" + revision = "fd36f4220a901265f90734c3183c5f0c91daa0b8" + +[[projects]] + digest = 
"1:acd87a73c6a6f2d61ad04822d68b233a5c12f5b72aef3db0985f90680e9ae8f0" + name = "github.com/prometheus/common" + packages = [ + "expfmt", + "internal/bitbucket.org/ww/goautoneg", + "log", + "model", + ] + pruneopts = "" + revision = "1ba88736f028e37bc17328369e94a537ae9e0234" + version = "v0.4.0" + +[[projects]] + branch = "master" + digest = "1:8104ddcc08a1fb39322b65c886bf3a94c216b35268c8d3eed158ca0c6615de27" + name = "github.com/prometheus/procfs" + packages = [ + ".", + "internal/fs", + ] + pruneopts = "" + revision = "5867b95ac084bbfee6ea16595c4e05ab009021da" + +[[projects]] + digest = "1:631ea4a52a20ca54eceb1077e8c7e553a4f86a58639824825d9259374f7c362f" + name = "github.com/sirupsen/logrus" + packages = ["."] + pruneopts = "" + revision = "8bdbc7bcc01dcbb8ec23dc8a28e332258d25251f" + version = "v1.4.1" + +[[projects]] + digest = "1:956f655c87b7255c6b1ae6c203ebb0af98cf2a13ef2507e34c9bf1c0332ac0f5" + name = "github.com/spf13/afero" + packages = [ + ".", + "mem", + ] + pruneopts = "" + revision = "588a75ec4f32903aa5e39a2619ba6a4631e28424" + version = "v1.2.2" + +[[projects]] + digest = "1:ae3493c780092be9d576a1f746ab967293ec165e8473425631f06658b6212afc" + name = "github.com/spf13/cast" + packages = ["."] + pruneopts = "" + revision = "8c9545af88b134710ab1cd196795e7f2388358d7" + version = "v1.3.0" + +[[projects]] + digest = "1:a1403cc8a94b8d7956ee5e9694badef0e7b051af289caad1cf668331e3ffa4f6" + name = "github.com/spf13/cobra" + packages = ["."] + pruneopts = "" + revision = "ef82de70bb3f60c65fb8eebacbb2d122ef517385" + version = "v0.0.3" + +[[projects]] + digest = "1:cc15ae4fbdb02ce31f3392361a70ac041f4f02e0485de8ffac92bd8033e3d26e" + name = "github.com/spf13/jwalterweatherman" + packages = ["."] + pruneopts = "" + revision = "94f6ae3ed3bceceafa716478c5fbf8d29ca601a1" + version = "v1.1.0" + +[[projects]] + digest = "1:cbaf13cdbfef0e4734ed8a7504f57fe893d471d62a35b982bf6fb3f036449a66" + name = "github.com/spf13/pflag" + packages = ["."] + pruneopts = "" + revision = "298182f68c66c05229eb03ac171abe6e309ee79a" + version = "v1.0.3" + +[[projects]] + digest = "1:c25a789c738f7cc8ec7f34026badd4e117853f329334a5aa45cf5d0727d7d442" + name = "github.com/spf13/viper" + packages = ["."] + pruneopts = "" + revision = "ae103d7e593e371c69e832d5eb3347e2b80cbbc9" + +[[projects]] + digest = "1:381bcbeb112a51493d9d998bbba207a529c73dbb49b3fd789e48c63fac1f192c" + name = "github.com/stretchr/testify" + packages = ["assert"] + pruneopts = "" + revision = "ffdc059bfe9ce6a4e144ba849dbedead332c6053" + version = "v1.3.0" + +[[projects]] + digest = "1:e6ff7840319b6fda979a918a8801005ec2049abca62af19211d96971d8ec3327" + name = "go.uber.org/atomic" + packages = ["."] + pruneopts = "" + revision = "df976f2515e274675050de7b3f42545de80594fd" + version = "v1.4.0" + +[[projects]] + digest = "1:22c7effcb4da0eacb2bb1940ee173fac010e9ef3c691f5de4b524d538bd980f5" + name = "go.uber.org/multierr" + packages = ["."] + pruneopts = "" + revision = "3c4937480c32f4c13a875a1829af76c98ca3d40a" + version = "v1.1.0" + +[[projects]] + digest = "1:984e93aca9088b440b894df41f2043b6a3db8f9cf30767032770bfc4796993b0" + name = "go.uber.org/zap" + packages = [ + ".", + "buffer", + "internal/bufferpool", + "internal/color", + "internal/exit", + "zapcore", + ] + pruneopts = "" + revision = "27376062155ad36be76b0f12cf1572a221d3a48c" + version = "v1.10.0" + +[[projects]] + branch = "master" + digest = "1:5b3e9450868bcf9ecbca2b01ac04f142255b5744d89ec97e1ceedf57d4522645" + name = "golang.org/x/crypto" + packages = ["ssh/terminal"] + pruneopts = "" + revision = 
"22d7a77e9e5f409e934ed268692e56707cd169e5" + +[[projects]] + branch = "master" + digest = "1:aa38821ad1406a84f9577465ef53e56ca4d90745a710c753190bf7792d726c82" + name = "golang.org/x/net" + packages = [ + "http/httpguts", + "http2", + "http2/hpack", + "idna", + "publicsuffix", + ] + pruneopts = "" + revision = "3ec19112720433827bbce8be9342797f5a6aaaf9" + +[[projects]] + branch = "master" + digest = "1:e7b16fd4b8b54d3c2425a52b0069b811e4583514d5dc7863a5f90d9cb145b8c1" + name = "golang.org/x/sys" + packages = [ + "unix", + "windows", + "windows/registry", + "windows/svc/eventlog", + ] + pruneopts = "" + revision = "87c872767d25fb96dfe96c794fd028b38a08440b" + +[[projects]] + digest = "1:740b51a55815493a8d0f2b1e0d0ae48fe48953bf7eaf3fcc4198823bf67768c0" + name = "golang.org/x/text" + packages = [ + "collate", + "collate/build", + "internal/colltab", + "internal/gen", + "internal/language", + "internal/language/compact", + "internal/tag", + "internal/triegen", + "internal/ucd", + "language", + "secure/bidirule", + "transform", + "unicode/bidi", + "unicode/cldr", + "unicode/norm", + "unicode/rangetable", + "width", + ] + pruneopts = "" + revision = "342b2e1fbaa52c93f31447ad2c6abc048c63e475" + version = "v0.3.2" + +[[projects]] + branch = "master" + digest = "1:9522af4be529c108010f95b05f1022cb872f2b9ff8b101080f554245673466e1" + name = "golang.org/x/time" + packages = ["rate"] + pruneopts = "" + revision = "9d24e82272b4f38b78bc8cff74fa936d31ccd8ef" + +[[projects]] + branch = "master" + digest = "1:c18a424a892e5c90d397328a6f00700c33af0be832faedca7189e2fcdca173e3" + name = "golang.org/x/tools" + packages = [ + "go/ast/astutil", + "go/gcexportdata", + "go/internal/gcimporter", + "go/internal/packagesdriver", + "go/packages", + "go/types/typeutil", + "imports", + "internal/fastwalk", + "internal/gopathwalk", + "internal/module", + "internal/semver", + ] + pruneopts = "" + revision = "7c3f65130f290a790cd2a8742f0636e46e345bc2" + +[[projects]] + digest = "1:15d017551627c8bb091bde628215b2861bed128855343fdd570c62d08871f6e1" + name = "gopkg.in/alecthomas/kingpin.v2" + packages = ["."] + pruneopts = "" + revision = "947dcec5ba9c011838740e680966fd7087a71d0d" + version = "v2.2.6" + +[[projects]] + branch = "v1" + digest = "1:1d01f96bc2293b56c3dec797b8f976d7613fb30ce92bfbc994130404f7f7f031" + name = "gopkg.in/check.v1" + packages = ["."] + pruneopts = "" + revision = "788fd78401277ebd861206a03c884797c6ec5541" + +[[projects]] + digest = "1:75fb3fcfc73a8c723efde7777b40e8e8ff9babf30d8c56160d01beffea8a95a6" + name = "gopkg.in/inf.v0" + packages = ["."] + pruneopts = "" + revision = "d2d2541c53f18d2a059457998ce2876cc8e67cbf" + version = "v0.9.1" + +[[projects]] + digest = "1:cedccf16b71e86db87a24f8d4c70b0a855872eb967cb906a66b95de56aefbd0d" + name = "gopkg.in/yaml.v2" + packages = ["."] + pruneopts = "" + revision = "51d6538a90f86fe93ac480b35f37b2be17fef232" + version = "v2.2.2" + +[[projects]] + digest = "1:2fe7efa9ea3052443378383d27c15ba088d03babe69a89815ce7fe9ec1d9aeb4" + name = "k8s.io/api" + packages = [ + "admission/v1beta1", + "admissionregistration/v1alpha1", + "admissionregistration/v1beta1", + "apps/v1", + "apps/v1beta1", + "apps/v1beta2", + "authentication/v1", + "authentication/v1beta1", + "authorization/v1", + "authorization/v1beta1", + "autoscaling/v1", + "autoscaling/v2beta1", + "batch/v1", + "batch/v1beta1", + "batch/v2alpha1", + "certificates/v1beta1", + "core/v1", + "events/v1beta1", + "extensions/v1beta1", + "networking/v1", + "policy/v1beta1", + "rbac/v1", + "rbac/v1alpha1", + "rbac/v1beta1", + 
"scheduling/v1alpha1", + "scheduling/v1beta1", + "settings/v1alpha1", + "storage/v1", + "storage/v1alpha1", + "storage/v1beta1", + ] + pruneopts = "" + revision = "2d6f90ab1293a1fb871cf149423ebb72aa7423aa" + version = "kubernetes-1.11.2" + +[[projects]] + digest = "1:d04bb4b31e495fa35739550e4dec2dd4c6f45c57f6d7fccb830a6ab75762efaa" + name = "k8s.io/apiextensions-apiserver" + packages = [ + "pkg/apis/apiextensions", + "pkg/apis/apiextensions/v1beta1", + "pkg/client/clientset/clientset", + "pkg/client/clientset/clientset/scheme", + "pkg/client/clientset/clientset/typed/apiextensions/v1beta1", + ] + pruneopts = "" + revision = "408db4a50408e2149acbd657bceb2480c13cb0a4" + version = "kubernetes-1.11.2" + +[[projects]] + digest = "1:b6b2fb7b4da1ac973b64534ace2299a02504f16bc7820cb48edb8ca4077183e1" + name = "k8s.io/apimachinery" + packages = [ + "pkg/api/equality", + "pkg/api/errors", + "pkg/api/meta", + "pkg/api/resource", + "pkg/apis/meta/internalversion", + "pkg/apis/meta/v1", + "pkg/apis/meta/v1/unstructured", + "pkg/apis/meta/v1beta1", + "pkg/conversion", + "pkg/conversion/queryparams", + "pkg/fields", + "pkg/labels", + "pkg/runtime", + "pkg/runtime/schema", + "pkg/runtime/serializer", + "pkg/runtime/serializer/json", + "pkg/runtime/serializer/protobuf", + "pkg/runtime/serializer/recognizer", + "pkg/runtime/serializer/streaming", + "pkg/runtime/serializer/versioning", + "pkg/selection", + "pkg/types", + "pkg/util/cache", + "pkg/util/clock", + "pkg/util/diff", + "pkg/util/errors", + "pkg/util/framer", + "pkg/util/intstr", + "pkg/util/json", + "pkg/util/mergepatch", + "pkg/util/net", + "pkg/util/rand", + "pkg/util/runtime", + "pkg/util/sets", + "pkg/util/strategicpatch", + "pkg/util/uuid", + "pkg/util/validation", + "pkg/util/validation/field", + "pkg/util/wait", + "pkg/util/yaml", + "pkg/version", + "pkg/watch", + "third_party/forked/golang/json", + "third_party/forked/golang/reflect", + ] + pruneopts = "" + revision = "103fd098999dc9c0c88536f5c9ad2e5da39373ae" + version = "kubernetes-1.11.2" + +[[projects]] + digest = "1:da788b52eda4a8cd4c564a69051b029f310f4ec232cfa3ec0e49b80b0e7b6616" + name = "k8s.io/client-go" + packages = [ + "discovery", + "discovery/fake", + "dynamic", + "kubernetes", + "kubernetes/scheme", + "kubernetes/typed/admissionregistration/v1alpha1", + "kubernetes/typed/admissionregistration/v1beta1", + "kubernetes/typed/apps/v1", + "kubernetes/typed/apps/v1beta1", + "kubernetes/typed/apps/v1beta2", + "kubernetes/typed/authentication/v1", + "kubernetes/typed/authentication/v1beta1", + "kubernetes/typed/authorization/v1", + "kubernetes/typed/authorization/v1beta1", + "kubernetes/typed/autoscaling/v1", + "kubernetes/typed/autoscaling/v2beta1", + "kubernetes/typed/batch/v1", + "kubernetes/typed/batch/v1beta1", + "kubernetes/typed/batch/v2alpha1", + "kubernetes/typed/certificates/v1beta1", + "kubernetes/typed/core/v1", + "kubernetes/typed/events/v1beta1", + "kubernetes/typed/extensions/v1beta1", + "kubernetes/typed/networking/v1", + "kubernetes/typed/policy/v1beta1", + "kubernetes/typed/rbac/v1", + "kubernetes/typed/rbac/v1alpha1", + "kubernetes/typed/rbac/v1beta1", + "kubernetes/typed/scheduling/v1alpha1", + "kubernetes/typed/scheduling/v1beta1", + "kubernetes/typed/settings/v1alpha1", + "kubernetes/typed/storage/v1", + "kubernetes/typed/storage/v1alpha1", + "kubernetes/typed/storage/v1beta1", + "pkg/apis/clientauthentication", + "pkg/apis/clientauthentication/v1alpha1", + "pkg/apis/clientauthentication/v1beta1", + "pkg/version", + "plugin/pkg/client/auth/exec", + "rest", + 
"rest/watch", + "restmapper", + "testing", + "tools/auth", + "tools/cache", + "tools/clientcmd", + "tools/clientcmd/api", + "tools/clientcmd/api/latest", + "tools/clientcmd/api/v1", + "tools/leaderelection", + "tools/leaderelection/resourcelock", + "tools/metrics", + "tools/pager", + "tools/record", + "tools/reference", + "transport", + "util/buffer", + "util/cert", + "util/connrotation", + "util/flowcontrol", + "util/homedir", + "util/integer", + "util/retry", + "util/workqueue", + ] + pruneopts = "" + revision = "1f13a808da65775f22cbf47862c4e5898d8f4ca1" + version = "kubernetes-1.11.2" + +[[projects]] + digest = "1:43ef9a37919f7a8948b7de4c05d20692f763adc40f15c9d330c544ae05d93947" + name = "k8s.io/code-generator" + packages = [ + "cmd/client-gen", + "cmd/client-gen/args", + "cmd/client-gen/generators", + "cmd/client-gen/generators/fake", + "cmd/client-gen/generators/scheme", + "cmd/client-gen/generators/util", + "cmd/client-gen/path", + "cmd/client-gen/types", + "cmd/conversion-gen", + "cmd/conversion-gen/args", + "cmd/conversion-gen/generators", + "cmd/deepcopy-gen", + "cmd/deepcopy-gen/args", + "cmd/defaulter-gen", + "cmd/defaulter-gen/args", + "cmd/informer-gen", + "cmd/informer-gen/args", + "cmd/informer-gen/generators", + "cmd/lister-gen", + "cmd/lister-gen/args", + "cmd/lister-gen/generators", + "cmd/openapi-gen", + "cmd/openapi-gen/args", + "pkg/util", + ] + pruneopts = "" + revision = "6702109cc68eb6fe6350b83e14407c8d7309fd1a" + version = "kubernetes-1.11.2" + +[[projects]] + branch = "master" + digest = "1:6a2a63e09a59caff3fd2d36d69b7b92c2fe7cf783390f0b7349fb330820f9a8e" + name = "k8s.io/gengo" + packages = [ + "args", + "examples/deepcopy-gen/generators", + "examples/defaulter-gen/generators", + "examples/set-gen/sets", + "generator", + "namer", + "parser", + "types", + ] + pruneopts = "" + revision = "e17681d19d3ac4837a019ece36c2a0ec31ffe985" + +[[projects]] + digest = "1:4b78eccecdf36f29cacc19ca79411f2235e0387af52b11f1d77328d7ad5d84a2" + name = "k8s.io/klog" + packages = ["."] + pruneopts = "" + revision = "e531227889390a39d9533dde61f590fe9f4b0035" + version = "v0.3.0" + +[[projects]] + branch = "master" + digest = "1:970b561bbc525ee84641edf295de3d30b5746c1b7f6ca333de37655cad160789" + name = "k8s.io/kube-openapi" + packages = [ + "cmd/openapi-gen/args", + "pkg/common", + "pkg/generators", + "pkg/generators/rules", + "pkg/util/proto", + "pkg/util/sets", + ] + pruneopts = "" + revision = "a01b7d5d6c2258c80a4a10070f3dee9cd575d9c7" + +[[projects]] + digest = "1:9e859df288d941d0d740cc923766890a5a3b4d01f9161a26829ad945b17fe750" + name = "sigs.k8s.io/controller-runtime" + packages = [ + "pkg/cache", + "pkg/cache/internal", + "pkg/client", + "pkg/client/apiutil", + "pkg/client/config", + "pkg/controller", + "pkg/event", + "pkg/handler", + "pkg/internal/controller", + "pkg/internal/recorder", + "pkg/leaderelection", + "pkg/manager", + "pkg/patch", + "pkg/predicate", + "pkg/reconcile", + "pkg/recorder", + "pkg/runtime/inject", + "pkg/runtime/log", + "pkg/source", + "pkg/source/internal", + "pkg/webhook/admission", + "pkg/webhook/admission/types", + "pkg/webhook/types", + ] + pruneopts = "" + revision = "5fd1e9e9fac5261e9ad9d47c375afc014fc31d21" + version = "v0.1.7" + +[solve-meta] + analyzer-name = "dep" + analyzer-version = 1 + input-imports = [ + "github.com/davecgh/go-spew/spew", + "github.com/go-resty/resty", + "github.com/jarcoal/httpmock", + "github.com/kubernetes-sigs/controller-runtime/pkg/runtime/signals", + "github.com/lyft/flytestdlib/config", + 
"github.com/lyft/flytestdlib/config/viper", + "github.com/lyft/flytestdlib/contextutils", + "github.com/lyft/flytestdlib/logger", + "github.com/lyft/flytestdlib/profutils", + "github.com/lyft/flytestdlib/promutils", + "github.com/lyft/flytestdlib/promutils/labeled", + "github.com/lyft/flytestdlib/version", + "github.com/mitchellh/mapstructure", + "github.com/pkg/errors", + "github.com/prometheus/common/log", + "github.com/spf13/cobra", + "github.com/spf13/pflag", + "github.com/stretchr/testify/assert", + "gopkg.in/check.v1", + "gopkg.in/yaml.v2", + "k8s.io/api/apps/v1", + "k8s.io/api/core/v1", + "k8s.io/api/extensions/v1beta1", + "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1", + "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset", + "k8s.io/apimachinery/pkg/api/equality", + "k8s.io/apimachinery/pkg/api/errors", + "k8s.io/apimachinery/pkg/api/resource", + "k8s.io/apimachinery/pkg/apis/meta/v1", + "k8s.io/apimachinery/pkg/labels", + "k8s.io/apimachinery/pkg/runtime", + "k8s.io/apimachinery/pkg/runtime/schema", + "k8s.io/apimachinery/pkg/runtime/serializer", + "k8s.io/apimachinery/pkg/types", + "k8s.io/apimachinery/pkg/util/clock", + "k8s.io/apimachinery/pkg/util/intstr", + "k8s.io/apimachinery/pkg/util/json", + "k8s.io/apimachinery/pkg/util/yaml", + "k8s.io/apimachinery/pkg/watch", + "k8s.io/client-go/discovery", + "k8s.io/client-go/discovery/fake", + "k8s.io/client-go/kubernetes", + "k8s.io/client-go/rest", + "k8s.io/client-go/testing", + "k8s.io/client-go/tools/clientcmd", + "k8s.io/client-go/util/flowcontrol", + "k8s.io/client-go/util/homedir", + "k8s.io/code-generator/cmd/client-gen", + "k8s.io/code-generator/cmd/conversion-gen", + "k8s.io/code-generator/cmd/deepcopy-gen", + "k8s.io/code-generator/cmd/defaulter-gen", + "k8s.io/code-generator/cmd/informer-gen", + "k8s.io/code-generator/cmd/lister-gen", + "k8s.io/code-generator/cmd/openapi-gen", + "k8s.io/gengo/args", + "sigs.k8s.io/controller-runtime/pkg/cache", + "sigs.k8s.io/controller-runtime/pkg/client", + "sigs.k8s.io/controller-runtime/pkg/client/config", + "sigs.k8s.io/controller-runtime/pkg/controller", + "sigs.k8s.io/controller-runtime/pkg/event", + "sigs.k8s.io/controller-runtime/pkg/handler", + "sigs.k8s.io/controller-runtime/pkg/manager", + "sigs.k8s.io/controller-runtime/pkg/predicate", + "sigs.k8s.io/controller-runtime/pkg/reconcile", + "sigs.k8s.io/controller-runtime/pkg/source", + ] + solver-name = "gps-cdcl" + solver-version = 1 diff --git a/Gopkg.toml b/Gopkg.toml new file mode 100644 index 00000000..e7d9ac90 --- /dev/null +++ b/Gopkg.toml @@ -0,0 +1,35 @@ +required = [ + "sigs.k8s.io/controller-runtime/pkg/client/config", + "k8s.io/code-generator/cmd/defaulter-gen", + "k8s.io/code-generator/cmd/deepcopy-gen", + "k8s.io/code-generator/cmd/conversion-gen", + "k8s.io/code-generator/cmd/client-gen", + "k8s.io/code-generator/cmd/lister-gen", + "k8s.io/code-generator/cmd/informer-gen", + "k8s.io/code-generator/cmd/openapi-gen", + "k8s.io/gengo/args", +] + +[[constraint]] + name = "sigs.k8s.io/controller-runtime" + version = "^0.1.0" + +[[override]] + name = "k8s.io/code-generator" + version = "kubernetes-1.11.2" + +[[override]] + name = "k8s.io/api" + version = "kubernetes-1.11.2" + +[[override]] + name = "k8s.io/apimachinery" + version = "kubernetes-1.11.2" + +[[constraint]] + name = "k8s.io/client-go" + version = "kubernetes-1.11.2" + +[[override]] + name = "k8s.io/apiextensions-apiserver" + version = "kubernetes-1.11.2" diff --git a/LICENSE b/LICENSE new file mode 100644 index 
00000000..bed43751 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 Lyft, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..72656219 --- /dev/null +++ b/Makefile @@ -0,0 +1,22 @@ +export REPOSITORY=flinkk8soperator +include boilerplate/lyft/docker_build/Makefile +include boilerplate/lyft/golang_test_targets/Makefile + +.PHONY: generate +generate: + tmp/codegen/update-generated.sh + +.PHONY: compile +compile: generate + mkdir -p ./bin + go build -o bin/flinkoperator ./cmd/flinkk8soperator/main.go + +.PHONY: linux_compile +linux_compile: generate + GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o /artifacts/flinkoperator ./cmd/flinkk8soperator/main.go + +gen-config: + which pflags || (go get github.com/lyft/flytestdlib/cli/pflags) + @go generate ./... + +all: compile diff --git a/NOTICE b/NOTICE new file mode 100644 index 00000000..63527244 --- /dev/null +++ b/NOTICE @@ -0,0 +1,20 @@ +flinkk8soperator +Copyright 2019-2020 Lyft Inc. + +This product includes software developed at Lyft Inc. + +Notices for file(s): + examples/wordcount/src/main/java/org/apache/flink/ contains work from https://github.com/apache/flink under the Apache2 license. + +/* +Copyright 2016 The Kubernetes Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ diff --git a/README.md b/README.md new file mode 100644 index 00000000..596a87ee --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ +[![Current Release](https://img.shields.io/github/release/lyft/flinkk8soperator.svg)](https://github.com/lyft/flinkk8soperator/releases/latest) +[![Build Status](https://travis-ci.org/lyft/flinkk8soperator.svg?branch=master)](https://travis-ci.org/lyft/flinkk8soperator) +[![GoDoc](https://godoc.org/github.com/lyft/flinkk8soperator?status.svg)](https://godoc.org/github.com/lyft/flinkk8soperator) +[![License](https://img.shields.io/badge/LICENSE-Apache2.0-ff69b4.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) +[![CodeCoverage](https://img.shields.io/codecov/c/github/lyft/flinkk8soperator.svg)](https://codecov.io/gh/lyft/flinkk8soperator) +[![Go Report Card](https://goreportcard.com/badge/github.com/lyft/flinkk8soperator)](https://goreportcard.com/report/github.com/lyft/flinkk8soperator) +![Commit activity](https://img.shields.io/github/commit-activity/w/lyft/flinkk8soperator.svg?style=plastic) +![Commit since last release](https://img.shields.io/github/commits-since/lyft/flinkk8soperator/latest.svg?style=plastic) + +# Flinkk8soperator +FlinkK8sOperator is a [Kubernetes operator](https://coreos.com/operators/) that manages [Flink](https://flink.apache.org/) applications on Kubernetes. The operator acts as control plane to manage the complete deployment lifecycle of the application. 
+ + +## Project Status + +*Alpha* + +The FlinkK8sOperator is still under active development and has not been extensively tested in a production environment. Backward compatibility of the APIs is not guaranteed for alpha releases. + +## Prerequisites +* Version >= 1.9 of Kubernetes. +* Version >= 1.7 of Apache Flink. + +## Overview + +![Flink operator overview](docs/flink-operator-overview.svg) + +The goal of running Flink on Kubernetes is to enable more flexible, lighter-weight deployment of streaming applications, without needing to manage infrastructure. The Flink operator aims to abstract out the complexity of hosting, configuring, managing and operating Flink clusters from application developers. It achieves this by extending any Kubernetes cluster using [custom resources](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources). + +The Operator creates Flink clusters dynamically using the specified custom resource. Flink clusters in Kubernetes consist of the following: +* JobManager [Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) +* TaskManager [Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) +* JobManager [Service](https://kubernetes.io/docs/concepts/services-networking/service/) +* JobManager [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) for the UI + +Deploying and managing Flink applications in Kubernetes involves two steps: + +* **Building the Flink application packaged as a Docker image:** A Docker image is built containing the application source code with the necessary dependencies built in. This is required to bootstrap the JobManager and TaskManager pods. At Lyft we use Source-To-Image [S2I](https://docs.openshift.com/enterprise/3.0/using_images/s2i_images/index.html) as the image build tool that provides a common builder image with Apache Flink pre-installed. The Docker image can be built using any pre-existing workflow at an organization. + +* **Creating the Flink application custom resource:** The custom resource for the Flink application provides the spec for configuring and managing Flink clusters in Kubernetes. The FlinkK8sOperator, deployed on Kubernetes, continuously monitors the resource and the corresponding Flink cluster, and performs actions based on the diff (see the sketch below).
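For orientation, here is a minimal sketch of what managing such a resource might look like once the operator and its custom resource definition are installed (the `flinkapplications` resource name and the manifest file name are illustrative, not part of this change):

```sh
# Sketch: assumes a FlinkApplication manifest written against the spec in docs/crd.md.
kubectl apply -f flink-app.yaml

# The operator reconciles it into the JobManager/TaskManager Deployments, Service and
# Ingress listed above; status can be inspected with the custom-columns template
# added under artifacts/ in this change.
kubectl get flinkapplications --all-namespaces \
  -o custom-columns-file=artifacts/kubectl.template
```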
+ +## Documentation + +* [Quick start guide](/docs/quick-start-guide.md) +* [User guide](/docs/user_guide.md) +* [Flink application custom resource](/docs/crd.md) +* [Operator state machine](/docs/state_machine.md) diff --git a/artifacts/kubectl.template b/artifacts/kubectl.template new file mode 100644 index 00000000..5662a124 --- /dev/null +++ b/artifacts/kubectl.template @@ -0,0 +1,2 @@ +Namespace Name Status CreatedAt LastUpdatedAt Reason +.metadata.namespace .metadata.name .status.phase .metadata.creationTimestamp .status.last_updated_at .status.reason diff --git a/boilerplate/lyft/docker_build/Makefile b/boilerplate/lyft/docker_build/Makefile new file mode 100644 index 00000000..e7913fe4 --- /dev/null +++ b/boilerplate/lyft/docker_build/Makefile @@ -0,0 +1,7 @@ +.PHONY: docker_build +docker_build: + IMAGE_NAME=$$REPOSITORY ./boilerplate/lyft/docker_build/docker_build.sh + +.PHONY: dockerhub_push +dockerhub_push: + IMAGE_NAME=lyft/$$REPOSITORY REGISTRY=docker.io ./boilerplate/lyft/docker_build/docker_build.sh diff --git a/boilerplate/lyft/docker_build/Readme.rst b/boilerplate/lyft/docker_build/Readme.rst new file mode 100644 index 00000000..d9c4982f --- /dev/null +++ b/boilerplate/lyft/docker_build/Readme.rst @@ -0,0 +1,25 @@ +Docker Build and Push +~~~~~~~~~~~~~~~~~~~~~ + +Provides a ``make docker_build`` target that builds your image locally. + +Provides a ``make dockerhub_push`` target that pushes your final image to Dockerhub. + +The Dockerhub image will tagged ``:`` + +If git head has a git tag, the Dockerhub image will also be tagged ``:``. + +**To Enable:** + +Add ``lyft/docker_build`` to your ``boilerplate/update.cfg`` file. + +Your Dockerfile **must** use docker's `multi-stage builds `_ and name the builder stage 'builder'. + +Add ``include boilerplate/lyft/docker_build/Makefile`` in your main ``Makefile`` _after_ your REPOSITORY environment variable + +:: + + REPOSITORY= + include boilerplate/lyft/docker_build/Makefile + +(this ensures the extra Make targets get included in your main Makefile) diff --git a/boilerplate/lyft/docker_build/docker_build.sh b/boilerplate/lyft/docker_build/docker_build.sh new file mode 100755 index 00000000..a8feb983 --- /dev/null +++ b/boilerplate/lyft/docker_build/docker_build.sh @@ -0,0 +1,62 @@ +set -e + +echo "" +echo "------------------------------------" +echo " DOCKER BUILD" +echo "------------------------------------" +echo "" + +# If you have a special id_rsa file, you can pass it here. +: ${RSA_FILE=~/.ssh/id_rsa} + +if [ -n "$REGISTRY" ]; then + # Do not push if there are unstaged git changes + CHANGED=$(git status --porcelain) + if [ -n "$CHANGED" ]; then + echo "Please commit git changes before pushing to a registry" + exit 1 + fi +fi + + +GIT_SHA=$(git rev-parse HEAD) + +IMAGE_TAG_SUFFIX="" +# for intermediate build phases, append -$BUILD_PHASE to all image tags +if [ -n "$BUILD_PHASE" ]; then + IMAGE_TAG_SUFFIX="-${BUILD_PHASE}" +fi + +IMAGE_TAG_WITH_SHA="${IMAGE_NAME}:${GIT_SHA}${IMAGE_TAG_SUFFIX}" + +RELEASE_SEMVER=$(git describe --tags --exact-match "$GIT_SHA" 2>/dev/null) || true +if [ -n "$RELEASE_SEMVER" ]; then + IMAGE_TAG_WITH_SEMVER="${IMAGE_NAME}:${RELEASE_SEMVER}${IMAGE_TAG_SUFFIX}" +fi + +# build the image +docker build -t "$IMAGE_TAG_WITH_SHA" . +echo "${IMAGE_TAG_WITH_SHA} built locally." 
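# A sketch of typical invocations of this script via the boilerplate Makefile
# targets documented in Readme.rst above (not part of the original script):
#   make docker_build     # builds locally; the image is tagged <IMAGE_NAME>:<git sha>
#   make dockerhub_push   # sets IMAGE_NAME=lyft/$REPOSITORY and REGISTRY=docker.io,
#                         # so the sha tag (and a semver tag, if HEAD is tagged) is pushed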
+ +# if REGISTRY specified, push the images to the remote registry +if [ -n "$REGISTRY" ]; then + + if [ -n "${DOCKER_REGISTRY_PASSWORD}" ]; then + docker login --username="$DOCKER_REGISTRY_USERNAME" --password="$DOCKER_REGISTRY_PASSWORD" + fi + + docker tag "$IMAGE_TAG_WITH_SHA" "${REGISTRY}/${IMAGE_TAG_WITH_SHA}" + + docker push "${REGISTRY}/${IMAGE_TAG_WITH_SHA}" + echo "${REGISTRY}/${IMAGE_TAG_WITH_SHA} pushed to remote." + + # If the current commit has a semver tag, also push the images with the semver tag + if [ -n "$RELEASE_SEMVER" ]; then + + docker tag "$IMAGE_TAG_WITH_SHA" "${REGISTRY}/${IMAGE_TAG_WITH_SEMVER}" + + docker push "${REGISTRY}/${IMAGE_TAG_WITH_SEMVER}" + echo "${REGISTRY}/${IMAGE_TAG_WITH_SEMVER} pushed to remote." + + fi +fi diff --git a/boilerplate/lyft/golang_test_targets/81868GOPATH/bin/golangci-lint b/boilerplate/lyft/golang_test_targets/81868GOPATH/bin/golangci-lint new file mode 100755 index 00000000..a96cb9c2 Binary files /dev/null and b/boilerplate/lyft/golang_test_targets/81868GOPATH/bin/golangci-lint differ diff --git a/boilerplate/lyft/golang_test_targets/Makefile b/boilerplate/lyft/golang_test_targets/Makefile new file mode 100644 index 00000000..0ed9a718 --- /dev/null +++ b/boilerplate/lyft/golang_test_targets/Makefile @@ -0,0 +1,31 @@ +.PHONY: lint +lint: #lints the package for common code smells + which golangci-lint || sh boilerplate/lyft/golang_test_targets/golangci-lint.sh -b $$GOPATH/bin v1.16.0 + golangci-lint run + +# If code is failing the goimports linter, this will fix it. +# skips 'vendor' +.PHONY: goimports +goimports: + @boilerplate/lyft/golang_test_targets/goimports + +.PHONY: install +install: #download dependencies (including test deps) for the package + which dep || sh boilerplate/lyft/golang_test_targets/dep_install.sh + dep ensure + +.PHONY: test_unit +test_unit: + go test -cover ./... -race + +.PHONY: test_benchmark +test_benchmark: + go test -bench . ./... + +.PHONY: test_unit_cover +test_unit_cover: + go test ./... -coverprofile /tmp/cover.out -covermode=count; go tool cover -func /tmp/cover.out + +.PHONY: test_unit_visual +test_unit_visual: + go test ./... -coverprofile /tmp/cover.out -covermode=count; go tool cover -html=/tmp/cover.out diff --git a/boilerplate/lyft/golang_test_targets/Readme.rst b/boilerplate/lyft/golang_test_targets/Readme.rst new file mode 100644 index 00000000..acc5744f --- /dev/null +++ b/boilerplate/lyft/golang_test_targets/Readme.rst @@ -0,0 +1,31 @@ +Golang Test Targets +~~~~~~~~~~~~~~~~~~~ + +Provides an ``install`` make target that uses ``dep`` to install golang dependencies. + +Provides a ``lint`` make target that uses golangci-lint to lint your code. + +Provides a ``test_unit`` target for unit tests. + +Provides a ``test_unit_cover`` target for analysing coverage of unit tests, which will output the coverage of each function and total statement coverage. + +Provides a ``test_unit_visual`` target for visualizing coverage of unit tests through an interactive HTML code heat map. + +Provides a ``test_benchmark`` target for benchmark tests. + +**To Enable:** + +Add ``lyft/golang_test_targets`` to your ``boilerplate/update.cfg`` file. + +Make sure you're using ``dep`` for dependency management. + +Provide a ``.golangci`` configuration (the lint target requires it).
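Once the boilerplate Makefile is included (next step below), the targets can be run from the repository root; a sketch based on the Makefile added in this change::

    make install           # installs dep if needed, then runs 'dep ensure'
    make lint              # installs golangci-lint if needed, then runs it
    make test_unit         # go test -cover ./... -race
    make test_unit_cover   # prints per-function and total statement coverage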
+ +Add ``include boilerplate/lyft/golang_test_targets/Makefile`` in your main ``Makefile`` _after_ your REPOSITORY environment variable + +:: + + REPOSITORY= + include boilerplate/lyft/golang_test_targets/Makefile + +(this ensures the extra make targets get included in your main Makefile) diff --git a/boilerplate/lyft/golang_test_targets/dep_install.sh b/boilerplate/lyft/golang_test_targets/dep_install.sh new file mode 100755 index 00000000..9799b054 --- /dev/null +++ b/boilerplate/lyft/golang_test_targets/dep_install.sh @@ -0,0 +1,178 @@ +#!/bin/sh + +# This install script is intended to download and install the latest available +# release of the dep dependency manager for Golang. +# +# It attempts to identify the current platform and an error will be thrown if +# the platform is not supported. +# +# Environment variables: +# - INSTALL_DIRECTORY (optional): defaults to $GOPATH/bin +# - DEP_RELEASE_TAG (optional): defaults to fetching the latest release +# - DEP_OS (optional): use a specific value for OS (mostly for testing) +# - DEP_ARCH (optional): use a specific value for ARCH (mostly for testing) +# +# You can install using this script: +# $ curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh + +set -e + +RELEASES_URL="https://github.com/golang/dep/releases" + +downloadJSON() { + url="$2" + + echo "Fetching $url.." + if test -x "$(command -v curl)"; then + response=$(curl -s -L -w 'HTTPSTATUS:%{http_code}' -H 'Accept: application/json' "$url") + body=$(echo "$response" | sed -e 's/HTTPSTATUS\:.*//g') + code=$(echo "$response" | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + elif test -x "$(command -v wget)"; then + temp=$(mktemp) + body=$(wget -q --header='Accept: application/json' -O - --server-response "$url" 2> "$temp") + code=$(awk '/^ HTTP/{print $2}' < "$temp" | tail -1) + rm "$temp" + else + echo "Neither curl nor wget was available to perform http requests." + exit 1 + fi + if [ "$code" != 200 ]; then + echo "Request failed with code $code" + exit 1 + fi + + eval "$1='$body'" +} + +downloadFile() { + url="$1" + destination="$2" + + echo "Fetching $url.." + if test -x "$(command -v curl)"; then + code=$(curl -s -w '%{http_code}' -L "$url" -o "$destination") + elif test -x "$(command -v wget)"; then + code=$(wget -q -O "$destination" --server-response "$url" 2>&1 | awk '/^ HTTP/{print $2}' | tail -1) + else + echo "Neither curl nor wget was available to perform http requests." + exit 1 + fi + + if [ "$code" != 200 ]; then + echo "Request failed with code $code" + exit 1 + fi +} + +findGoBinDirectory() { + EFFECTIVE_GOPATH=$(go env GOPATH) + # CYGWIN: Convert Windows-style path into sh-compatible path + if [ "$OS_CYGWIN" = "1" ]; then + EFFECTIVE_GOPATH=$(cygpath "$EFFECTIVE_GOPATH") + fi + if [ -z "$EFFECTIVE_GOPATH" ]; then + echo "Installation could not determine your \$GOPATH." + exit 1 + fi + if [ -z "$GOBIN" ]; then + GOBIN=$(echo "${EFFECTIVE_GOPATH%%:*}/bin" | sed s#//*#/#g) + fi + if [ ! -d "$GOBIN" ]; then + echo "Installation requires your GOBIN directory $GOBIN to exist. Please create it." 
+ exit 1 + fi + eval "$1='$GOBIN'" +} + +initArch() { + ARCH=$(uname -m) + if [ -n "$DEP_ARCH" ]; then + echo "Using DEP_ARCH" + ARCH="$DEP_ARCH" + fi + case $ARCH in + amd64) ARCH="amd64";; + x86_64) ARCH="amd64";; + i386) ARCH="386";; + ppc64) ARCH="ppc64";; + ppc64le) ARCH="ppc64le";; + s390x) ARCH="s390x";; + armv6*) ARCH="arm";; + armv7*) ARCH="arm";; + aarch64) ARCH="arm64";; + *) echo "Architecture ${ARCH} is not supported by this installation script"; exit 1;; + esac + echo "ARCH = $ARCH" +} + +initOS() { + OS=$(uname | tr '[:upper:]' '[:lower:]') + OS_CYGWIN=0 + if [ -n "$DEP_OS" ]; then + echo "Using DEP_OS" + OS="$DEP_OS" + fi + case "$OS" in + darwin) OS='darwin';; + linux) OS='linux';; + freebsd) OS='freebsd';; + mingw*) OS='windows';; + msys*) OS='windows';; + cygwin*) + OS='windows' + OS_CYGWIN=1 + ;; + *) echo "OS ${OS} is not supported by this installation script"; exit 1;; + esac + echo "OS = $OS" +} + +# identify platform based on uname output +initArch +initOS + +# determine install directory if required +if [ -z "$INSTALL_DIRECTORY" ]; then + findGoBinDirectory INSTALL_DIRECTORY +fi +echo "Will install into $INSTALL_DIRECTORY" + +# assemble expected release artifact name +if [ "${OS}" != "linux" ] && { [ "${ARCH}" = "ppc64" ] || [ "${ARCH}" = "ppc64le" ];}; then + # ppc64 and ppc64le are only supported on Linux. + echo "${OS}-${ARCH} is not supported by this instalation script" +else + BINARY="dep-${OS}-${ARCH}" +fi + +# add .exe if on windows +if [ "$OS" = "windows" ]; then + BINARY="$BINARY.exe" +fi + +# if DEP_RELEASE_TAG was not provided, assume latest +if [ -z "$DEP_RELEASE_TAG" ]; then + downloadJSON LATEST_RELEASE "$RELEASES_URL/latest" + DEP_RELEASE_TAG=$(echo "${LATEST_RELEASE}" | tr -s '\n' ' ' | sed 's/.*"tag_name":"//' | sed 's/".*//' ) +fi +echo "Release Tag = $DEP_RELEASE_TAG" + +# fetch the real release data to make sure it exists before we attempt a download +downloadJSON RELEASE_DATA "$RELEASES_URL/tag/$DEP_RELEASE_TAG" + +BINARY_URL="$RELEASES_URL/download/$DEP_RELEASE_TAG/$BINARY" +DOWNLOAD_FILE=$(mktemp) + +downloadFile "$BINARY_URL" "$DOWNLOAD_FILE" + +echo "Setting executable permissions." +chmod +x "$DOWNLOAD_FILE" + +INSTALL_NAME="dep" + +if [ "$OS" = "windows" ]; then + INSTALL_NAME="$INSTALL_NAME.exe" +fi + +echo "Moving executable to $INSTALL_DIRECTORY/$INSTALL_NAME" +mv "$DOWNLOAD_FILE" "$INSTALL_DIRECTORY/$INSTALL_NAME" diff --git a/boilerplate/lyft/golang_test_targets/goimports b/boilerplate/lyft/golang_test_targets/goimports new file mode 100755 index 00000000..86118a78 --- /dev/null +++ b/boilerplate/lyft/golang_test_targets/goimports @@ -0,0 +1 @@ +goimports -w $(find . -type f -name '*.go' -not -path "./vendor/*" -not -path "./pkg/client/*") diff --git a/boilerplate/lyft/golang_test_targets/golangci-lint.sh b/boilerplate/lyft/golang_test_targets/golangci-lint.sh new file mode 100755 index 00000000..910bc4a5 --- /dev/null +++ b/boilerplate/lyft/golang_test_targets/golangci-lint.sh @@ -0,0 +1,385 @@ +#!/bin/sh +set -e +# Code generated by godownloader on 2019-05-25T21:22:36Z. DO NOT EDIT. 
+# + +usage() { + this=$1 + cat </dev/null +} +echoerr() { + echo "$@" 1>&2 +} +log_prefix() { + echo "$0" +} +_logp=6 +log_set_priority() { + _logp="$1" +} +log_priority() { + if test -z "$1"; then + echo "$_logp" + return + fi + [ "$1" -le "$_logp" ] +} +log_tag() { + case $1 in + 0) echo "emerg" ;; + 1) echo "alert" ;; + 2) echo "crit" ;; + 3) echo "err" ;; + 4) echo "warning" ;; + 5) echo "notice" ;; + 6) echo "info" ;; + 7) echo "debug" ;; + *) echo "$1" ;; + esac +} +log_debug() { + log_priority 7 || return 0 + echoerr "$(log_prefix)" "$(log_tag 7)" "$@" +} +log_info() { + log_priority 6 || return 0 + echoerr "$(log_prefix)" "$(log_tag 6)" "$@" +} +log_err() { + log_priority 3 || return 0 + echoerr "$(log_prefix)" "$(log_tag 3)" "$@" +} +log_crit() { + log_priority 2 || return 0 + echoerr "$(log_prefix)" "$(log_tag 2)" "$@" +} +uname_os() { + os=$(uname -s | tr '[:upper:]' '[:lower:]') + case "$os" in + msys_nt) os="windows" ;; + esac + echo "$os" +} +uname_arch() { + arch=$(uname -m) + case $arch in + x86_64) arch="amd64" ;; + x86) arch="386" ;; + i686) arch="386" ;; + i386) arch="386" ;; + aarch64) arch="arm64" ;; + armv5*) arch="armv5" ;; + armv6*) arch="armv6" ;; + armv7*) arch="armv7" ;; + esac + echo ${arch} +} +uname_os_check() { + os=$(uname_os) + case "$os" in + darwin) return 0 ;; + dragonfly) return 0 ;; + freebsd) return 0 ;; + linux) return 0 ;; + android) return 0 ;; + nacl) return 0 ;; + netbsd) return 0 ;; + openbsd) return 0 ;; + plan9) return 0 ;; + solaris) return 0 ;; + windows) return 0 ;; + esac + log_crit "uname_os_check '$(uname -s)' got converted to '$os' which is not a GOOS value. Please file bug at https://github.com/client9/shlib" + return 1 +} +uname_arch_check() { + arch=$(uname_arch) + case "$arch" in + 386) return 0 ;; + amd64) return 0 ;; + arm64) return 0 ;; + armv5) return 0 ;; + armv6) return 0 ;; + armv7) return 0 ;; + ppc64) return 0 ;; + ppc64le) return 0 ;; + mips) return 0 ;; + mipsle) return 0 ;; + mips64) return 0 ;; + mips64le) return 0 ;; + s390x) return 0 ;; + amd64p32) return 0 ;; + esac + log_crit "uname_arch_check '$(uname -m)' got converted to '$arch' which is not a GOARCH value. 
Please file bug report at https://github.com/client9/shlib" + return 1 +} +untar() { + tarball=$1 + case "${tarball}" in + *.tar.gz | *.tgz) tar -xzf "${tarball}" ;; + *.tar) tar -xf "${tarball}" ;; + *.zip) unzip "${tarball}" ;; + *) + log_err "untar unknown archive format for ${tarball}" + return 1 + ;; + esac +} +http_download_curl() { + local_file=$1 + source_url=$2 + header=$3 + if [ -z "$header" ]; then + code=$(curl -w '%{http_code}' -sL -o "$local_file" "$source_url") + else + code=$(curl -w '%{http_code}' -sL -H "$header" -o "$local_file" "$source_url") + fi + if [ "$code" != "200" ]; then + log_debug "http_download_curl received HTTP status $code" + return 1 + fi + return 0 +} +http_download_wget() { + local_file=$1 + source_url=$2 + header=$3 + if [ -z "$header" ]; then + wget -q -O "$local_file" "$source_url" + else + wget -q --header "$header" -O "$local_file" "$source_url" + fi +} +http_download() { + log_debug "http_download $2" + if is_command curl; then + http_download_curl "$@" + return + elif is_command wget; then + http_download_wget "$@" + return + fi + log_crit "http_download unable to find wget or curl" + return 1 +} +http_copy() { + tmp=$(mktemp) + http_download "${tmp}" "$1" "$2" || return 1 + body=$(cat "$tmp") + rm -f "${tmp}" + echo "$body" +} +github_release() { + owner_repo=$1 + version=$2 + test -z "$version" && version="latest" + giturl="https://github.com/${owner_repo}/releases/${version}" + json=$(http_copy "$giturl" "Accept:application/json") + test -z "$json" && return 1 + version=$(echo "$json" | tr -s '\n' ' ' | sed 's/.*"tag_name":"//' | sed 's/".*//') + test -z "$version" && return 1 + echo "$version" +} +hash_sha256() { + TARGET=${1:-/dev/stdin} + if is_command gsha256sum; then + hash=$(gsha256sum "$TARGET") || return 1 + echo "$hash" | cut -d ' ' -f 1 + elif is_command sha256sum; then + hash=$(sha256sum "$TARGET") || return 1 + echo "$hash" | cut -d ' ' -f 1 + elif is_command shasum; then + hash=$(shasum -a 256 "$TARGET" 2>/dev/null) || return 1 + echo "$hash" | cut -d ' ' -f 1 + elif is_command openssl; then + hash=$(openssl -dst openssl dgst -sha256 "$TARGET") || return 1 + echo "$hash" | cut -d ' ' -f a + else + log_crit "hash_sha256 unable to find command to compute sha-256 hash" + return 1 + fi +} +hash_sha256_verify() { + TARGET=$1 + checksums=$2 + if [ -z "$checksums" ]; then + log_err "hash_sha256_verify checksum file not specified in arg2" + return 1 + fi + BASENAME=${TARGET##*/} + want=$(grep "${BASENAME}" "${checksums}" 2>/dev/null | tr '\t' ' ' | cut -d ' ' -f 1) + if [ -z "$want" ]; then + log_err "hash_sha256_verify unable to find checksum for '${TARGET}' in '${checksums}'" + return 1 + fi + got=$(hash_sha256 "$TARGET") + if [ "$want" != "$got" ]; then + log_err "hash_sha256_verify checksum for '$TARGET' did not verify ${want} vs $got" + return 1 + fi +} +cat /dev/null < ServiceAccount +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: flinkoperator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flinkoperator +subjects: +- kind: ServiceAccount + name: flinkoperator + namespace: flink-operator diff --git a/deploy/role.yaml b/deploy/role.yaml new file mode 100644 index 00000000..b4875dfd --- /dev/null +++ b/deploy/role.yaml @@ -0,0 +1,79 @@ +# Create a ClusterRole for flinkk8soperator +# https://kubernetes.io/docs/admin/authorization/rbac/ +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flinkoperator +rules: + - apiGroups: + - "" + 
resources: + - pods + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - services + verbs: + - create + - get + - list + - watch + - apiGroups: + - extensions + - apps + resources: + - deployments + - deployments/status + - ingresses + - ingresses/status + verbs: + - get + - list + - watch + - create + - update + - delete +#Allow Event recording access + - apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - patch +#Allow Access to CRD + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - update +#Allow Access to flink applications under flink.k8s.io + - apiGroups: + - flink.k8s.io + resources: + - flinkapplications + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +--- +# Create a Service Account for flinkk8soperator +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flinkoperator + namespace: flink-operator diff --git a/docs/crd.md b/docs/crd.md new file mode 100644 index 00000000..0ae464ec --- /dev/null +++ b/docs/crd.md @@ -0,0 +1,98 @@ +# Flink Application Custom Resource Definition +The [flinkapplication](https://github.com/lyft/flinkk8soperator/blob/master/deploy/crd.yaml) is a [kubernetes custom resource](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/). Once the *flinkapplication* custom resource is created in Kubernetes, the FlinkK8sOperator watches the resource and tries to move it through a series of states until the desired state is reached. + +[FlinkApplication Custom Resource Example](https://github.com/lyft/flinkk8soperator/blob/master/examples/wordcount/flink-operator-custom-resource.yaml) + +Below is the list of fields in the custom resource and their description + +* **Spec** `type:FlinkApplicationSpec required=True` + Contains the entire specification of the flink application. + + * **Image** `type:string required=True` + The image name format should be registry/repository[:tag] to pull by tag, or registry/repository[@digest] to pull by digest + + * **ImagePullPolicy** `type:v1.PullPolicy` + The default pull policy is IfNotPresent which causes the Kubelet to skip pulling an image if it already exists. + + * **ImagePullSecrets** `type:[]v1.LocalObjectReference` + Indicates name of Secrets, Kubernetes should get the credentials from. + + * **TaskManagerConfig** `type:TaskManagerConfig required=true` + Configuration for the Flink task manager + + * **Resources** `type:ResourceRequirements` + Resources for the task manager. This includes cpu, memory, storage, and ephemeral-storage. If empty the operator will + use a default value for cpu and memory. + + * **Environment** `type:EnvironmentConfig` + Configuration for setting environment variables in the task manager. + + * **TaskSlots** `type:int32 required=true` + Number of task slots per task manager + + * **OffHeapMemoryFraction** `type:float64` + A value between 0 and 1 that represents % of container memory dedicated to system / off heap. The + remaining memory is allocated for heap. + + * **JobManagerConfig** `type:JobManagerConfig` + Configuration for the Flink job manager + + * **Resources** `type:ResourceRequirements` + Resources for the job manager. This includes cpu, memory, storage, and ephemeral-storage. If empty the operator will + use a default value for cpu and memory. + + * **Environment** `type:EnvironmentConfig` + Configuration for setting environment variables in the job manager. 
+ + * **Replicas** `type:int32 required=true` + Number of job managers for the flink cluster. If multiple job managers are provided, the user has to ensure that + correct environment variables are set for High availability mode. + + * **OffHeapMemoryFraction** `type:float64` + A value between 0 and 1 that represents % of container memory dedicated to system / off heap. The + remaining memory is allocated for heap. + + * **JarName** `type:string required=true` + Name of the jar file to be run. The application image needs to ensure that the jar file is present at the right location, as + the operator uses the Web API to submit jobs. + + * **Parallelism** `type:int32 required=true` + Job level parallelism for the Flink Job + + * **EntryClass** `type:string` + Entry point for the Flink job + + * **ProgramArgs** `type:string` + External configuration parameters to be passed as arguments to the job like input and output sources, etc + + * **SavepointInfo** `type:SavepointInfo` + Optional Savepoint info that can be passed in to indicate that the Flink job must resume from the corresponding savepoint. + + * **FlinkVersion** `type:string required=true` + The version of Flink to be managed. This version must match the version in the image. + + * **FlinkConfig** `type:FlinkConfig` + Optional map of flink configuration, which passed on to the deployment as environment variable with `OPERATOR_FLINK_CONFIG` + + * **DeploymentMode** `type:DeploymentMode` + Indicates the type of deployment that operator should perform if the custom resource is updated. Currently only Dual is supported. + + `Dual` This deployment mode is intended for applications where downtime during deployment needs to be as minimal as possible. In this deployment mode, the operator brings up a second Flink cluster with the new image, while the original Flink cluster is still active. Once the pods and containers in the new flink cluster are ready, the Operator cancels the job in the first Cluster with savepoint, deletes the cluster and starts the job in the second cluster. (More information in the state machine section below). This mode is suitable for real time processing applications. + + * **DeleteMode** `type:DeleteMode` + Indicates how Flink jobs are torn down when the FlinkApplication resource is deleted + + `Savepoint` (default) The operator will take a final savepoint before cancelling the job, and will not tear down the cluster until a savepoint has succeeded. + + `ForceCancel` The operator will force cancel the job before tearing down the cluster + + `None` The operator will immediately tear down the cluster + + * **RestartNonce** `type:string` + Can be set or modified to force a restart of the cluster + + * **Volumes** `type:[]v1.Volume` + Represents a named volume in a pod that may be accessed by any container in the pod. + + * **VolumeMounts** `type:[]v1.VolumeMount` + Describes a mounting of a Volume within a container. diff --git a/docs/flink-operator-overview.svg b/docs/flink-operator-overview.svg new file mode 100644 index 00000000..13f2e03b --- /dev/null +++ b/docs/flink-operator-overview.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/local_dev.md b/docs/local_dev.md new file mode 100644 index 00000000..50d1cbbb --- /dev/null +++ b/docs/local_dev.md @@ -0,0 +1,114 @@ +# Flink Operator local development + +This guide will describe how to get set up for local development of +the Flink Operator. 
This is most likely useful for people actually
+developing the operator, but may also be useful for developers looking
+to develop their applications locally.
+
+## Run the operator
+
+### Install [Docker for Mac](https://docs.docker.com/docker-for-mac/install/)
+
+Once installed and running, enable Kubernetes in settings (from the
+docker icon in the menu bar, click Preferences -> Kubernetes -> Enable
+Kubernetes).
+
+### (Optional) Setup kubernetes dashboard
+
+This can be a handy complement to the CLI, especially for new users.
+
+```bash
+$ kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v1.10.0/src/deploy/recommended/kubernetes-dashboard.yaml
+$ kubectl proxy &
+$ open http://localhost:8001/api/v1/namespaces/kube-system/services/https:kubernetes-dashboard:/proxy/#!/overview
+```
+
+### Set up your Go environment
+
+```bash
+$ export GOPATH=~/src/go
+```
+
+(should probably go into your shell's profile)
+
+### Checkout the code
+
+```bash
+$ mkdir -p $GOPATH/src/github.com/lyft
+$ cd $GOPATH/src/github.com/lyft
+$ git clone git@github.com:lyft/flinkk8soperator.git
+```
+
+### Install the custom resource definition
+
+```bash
+$ cd flinkk8soperator
+$ kubectl create -f deploy/crd.yaml
+```
+
+### Start the operator
+
+#### Option 1: run outside the kubernetes cluster
+
+In this mode, we run the operator locally (on our mac) or inside the
+IDE and configure it to talk to the docker-for-mac kubernetes
+cluster. This is very convenient for development, as we can iterate
+quickly, use a debugger, etc.
+
+```bash
+$ dep ensure
+$ KUBERNETES_CONFIG="$HOME/.kube/config" go run ./cmd/flinkk8soperator/main.go --config=local_config.yaml
+```
+
+(you may need to accept a firewall prompt and `brew install dep` if you don't have it installed)
+
+#### Option 2: run inside the kubernetes cluster
+
+This mode more realistically emulates how the operator will run in
+production, however the turn-around time for changes is much longer.
+
+First we need to build the docker container for the operator:
+
+```bash
+$ docker build -t flinkk8soperator .
+```
+
+Then create the operator cluster resources:
+
+```bash
+$ kubectl create -f deploy/flinkk8soperator_local.yaml
+```
+
+## Run an application
+
+```bash
+$ kubectl create -f examples/wordcount/flink-operator-custom-resource.yaml
+```
+
+Now you should be able to see two pods (one for the jobmanager and one
+for the taskmanager) starting:
+
+```bash
+$ kubectl get pods
+```
+
+You should also be able to access the jobmanager UI at:
+
+```bash
+http://localhost:8001/api/v1/namespaces/default/services/{APP_NAME}-jm:8081/proxy/#/overview
+```
+
+(note you will need to be running `kubectl proxy` for this to work)
+
+You can tail the logs for the jobmanager (which may be useful for
+debugging failures) via:
+
+```bash
+$ kubectl logs -f service/{APP_NAME}-jm
+```
+
+You can SSH into the jobmanager by running:
+
+```bash
+$ kubectl exec -it $(kubectl get pods -o=custom-columns=NAME:.metadata.name | grep "\-jm\-") -- /bin/bash
+```
diff --git a/docs/quick-start-guide.md b/docs/quick-start-guide.md
new file mode 100644
index 00000000..093b8035
--- /dev/null
+++ b/docs/quick-start-guide.md
@@ -0,0 +1,135 @@
+# Quick Start Guide
+
+If you are looking to develop and test the operator on your local machine, refer to the [Local development guide](local_dev.md).
+
+Follow the steps below if you have a Kubernetes cluster up and running.
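+
+If you are not sure whether the cluster is reachable, you can run a quick sanity check first (this assumes `kubectl` is already configured for your cluster; see the next section if it is not):
+
+```bash
+$ kubectl cluster-info
+$ kubectl get nodes
+```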
+ +## Setup kubectl +Follow the instructions [here](https://kubernetes.io/docs/tasks/tools/install-kubectl/) to install and setup kubectl + +## Operator installation + +* Let's first create the custom resource definition, namespace, and roles for running the flink operator. + +```bash +$ kubectl create -f deploy/crd.yaml +$ kubectl create -f deploy/namespace.yaml +$ kubectl create -f deploy/role.yaml +$ kubectl create -f deploy/role-binding.yaml +``` + +* Before creating the flink operator deployment, edit/update the config in [config.yaml](/deploy/config.yaml) + +```yaml +Replace the {ingress_suffix} to indicate your cluster's ingress url. +data: + config: |- + operator: + ingressUrlFormat: "{{$jobCluster}}.{ingress_suffix}" + logger: + level: 4 +``` + +```bash +$ kubectl create -f deploy/config.yaml +$ kubectl create -f deploy/flinkk8soperator.yaml +``` + +* Ensure that the flink operator pod is *RUNNING*, and check operator logs if needed. + +```bash +$ kubectl get pods -n flink-operator +$ kubectl logs {pod-name} -n flink-operator +``` + +## Running the example + +You can find sample application to run with the flink operator [here](/examples/). To run a flink application, run the following command: + +```bash +$ kubectl create -f examples/wordcount/flink-operator-custom-resource.yaml -n flink-operator +``` + +The above command will create the flink application custom resource in kubernetes. The operator will observe the custom resource, and will create a flink cluster in kubernetes. + +Command below should show deployments created for the application +```bash +$ kubectl get deployments -n flink-operator +``` + +Check the phase and other status attributes in the custom resource +```bash +$ kubectl get flinkapplication.flink.k8s.io -n flink-operator wordcount-operator-example -o yaml +``` + +The output should be something like this +```yaml +apiVersion: flink.k8s.io/v1alpha1 +kind: FlinkApplication +metadata: + clusterName: "" + creationTimestamp: "2019-05-03T01:29:27Z" + generation: 1 + labels: + environment: development + name: wordcount-operator-example + namespace: flink-operator + resourceVersion: "51383673" + selfLink: /apis/flink.k8s.io/v1alpha1/namespaces/flink-operator/flinkapplications/wordcount-operator-example + uid: e415a43d-6d42-11e9-bf89-0acd1b812506 +spec: + deploymentMode: Single + entryClass: org.apache.flink.WordCount + flinkConfig: + state.backend.fs.checkpointdir: file:///checkpoints/flink/checkpoints + state.checkpoints.dir: file:///checkpoints/flink/externalized-checkpoints + state.savepoints.dir: file:///checkpoints/flink/savepoints + taskmanager.heap.size: 200 + flinkVersion: "1.8" + image: docker.io/lyft/wordcount-operator-example:latest + jarName: wordcount-operator-example-1.0.0-SNAPSHOT.jar + jobManagerConfig: + envConfig: {} + replicas: 1 + resources: + requests: + cpu: 200m + memory: 200Mi + parallelism: 3 + restartNonce: "" + savepointInfo: {} + taskManagerConfig: + envConfig: {} + resources: + requests: + cpu: 200m + memory: 200Mi + taskSlots: 2 +status: + clusterStatus: + availableTaskSlots: 4 + health: Green + healthyTaskManagers: 2 + numberOfTaskManagers: 2 + numberOfTaskSlots: 4 + jobId: 8bda8bbe03946a690cc0f28f9a6f307f + lastUpdatedAt: "2019-05-03T01:30:28Z" + phase: Running +``` + +To check events for the `FlinkApplication` object, run the following command: + +```bash +$ kubectl describe flinkapplication.flink.k8s.io -n flink-operator wordcount-operator-example +``` + +This will show the events similarly to the following: + +``` +Events: + Type 
Reason Age From Message + ---- ------ ---- ---- ------- + Normal Update 35s flinkk8soperator Flink cluster created + Normal Update 5s flinkk8soperator Flink cluster is ready + Normal Update 4s flinkk8soperator Flink job submitted to cluster +``` diff --git a/docs/state_machine.md b/docs/state_machine.md new file mode 100644 index 00000000..8e1b111c --- /dev/null +++ b/docs/state_machine.md @@ -0,0 +1,59 @@ +# Flink operator state machine + +The core logic of the operator resides in the state machine. Various stages of the deployment lifecycle are mapped to +discrete states. The operator continuously monitors the FlinkApplication custom resource. When it becomes out of sync +with the underlying Kubernetes resources, it takes the necessary actions to update those resources to the desired state. +Typically this will involve traversing the state machine. The final desired state is `Running`, which indicates that a +healthy Flink cluster has been started and the Flink job has been successfully submitted. + +The full state machine looks like this: +![Flink operator state machine](state_machine.png) + +# States + +### New / Updating +`New` (indicated in the resource by the empty string) is the initial state that all FlinkApplication resources start in. +`Updating` is transitioned to when a change is made to an existing FlinkApplication. In both cases, a new cluster is +created, and we transition to the ClusterStarting phase to monitor. The deployment objects created by the operator are +labelled and annotated as indicated in the custom resource. The operator also sets the corresponding environment +variables and arguments for the containers to start up the Flink application from the image. + +### ClusterStarting +In this state, the operator monitors the Flink cluster created in the New state. Once it successfully starts, we +transition to the `Savepointing` state. Otherwise, if we are unable to start the cluster for some reason (an invalid +image, bad configuration, not enough Kubernetes resources, etc.), we transition to the `DeployFailed` state. + +### Savepointing +In the `Savepointing` state, the operator attempts to cancel the existing job with a +[savepoint](https://ci.apache.org/projects/flink/flink-docs-release-1.8/ops/state/savepoints.html) (if this is the first +deploy for the FlinkApplication and there is no existing job, we transition straight to `SubmittingJob`). The operator +monitors the savepoint process until it succeeds or fails. If savepointing fails, the operator will look for an +[externalized checkpoint](https://ci.apache.org/projects/flink/flink-docs-release-1.8/ops/state/checkpoints.html#resuming-from-a-retained-checkpoint). +If none are available, the application transitions to the `DeployFailed` state. Otherwise, it transitions to the +`SubmittingJob` state. + +### SubmittingJob +In this state, the operator waits until the JobManager is ready, then attempts to submit the Flink job to the cluster. +If we are updating an existing job or the user has specified a savepoint to restore from, that will be used. Once the +job is successfully running the application transitions to the `Running` state. If the job submission fails we +transition to the `RollingBack` state. + +### RollingBack +This state is reached when, in the middle of a deploy, the old job has been canceled but the new job did not come up +successfully. In that case we will attempt to roll back by resubmitting the old job on the old cluster, after which +we transition to the `DeployFailed` state. 
+
+### Running
+The `Running` state indicates that the FlinkApplication custom resource has reached the desired state, and the job is
+running in the Flink cluster. In this state the operator continuously checks if the resource has been modified and
+monitors the health of the Flink cluster and job.
+
+### DeployFailed
+The `DeployFailed` state operates exactly like the `Running` state. It exists to inform the user that an attempted
+update has failed, i.e., that the FlinkApplication status does not currently match the desired spec. In this state,
+the user should look at the Flink logs and Kubernetes events to determine what went wrong. The user can then perform
+a new deploy by updating the FlinkApplication.
+
+### Deleting
+This state indicates that the FlinkApplication resource has been deleted. The operator will clean up the job according
+to the DeleteMode configured. Once all clean-up steps have been performed, the FlinkApplication will be deleted.
diff --git a/docs/state_machine.mmd b/docs/state_machine.mmd
new file mode 100644
index 00000000..c10f5f44
--- /dev/null
+++ b/docs/state_machine.mmd
@@ -0,0 +1,30 @@
+%% This file can be compiled into state_machine.png by installing mermaidjs (https://mermaidjs.github.io/) and running
+%% mmdc -i state_machine.mmd -o state_machine.png -w 1732 -b transparent
+
+graph LR
+New --> ClusterStarting
+
+subgraph Running
+Running
+DeployFailed
+end
+
+subgraph Updating
+Running --> Updating
+Updating --> ClusterStarting
+DeployFailed --> Updating
+
+ClusterStarting --> Savepointing
+ClusterStarting -- Create fails --> DeployFailed
+
+Savepointing --> SubmittingJob
+Savepointing -- Savepoint fails and no externalized checkpoint --> DeployFailed
+
+SubmittingJob --> Running
+SubmittingJob -- job start fails --> RollingBackJob
+RollingBackJob --> DeployFailed
+end
+
+linkStyle 5 stroke:#FF0000
+linkStyle 7 stroke:#FF0000
+linkStyle 9 stroke:#FF0000
diff --git a/docs/state_machine.png b/docs/state_machine.png
new file mode 100644
index 00000000..6b364bd7
Binary files /dev/null and b/docs/state_machine.png differ
diff --git a/docs/user_guide.md b/docs/user_guide.md
new file mode 100644
index 00000000..e56bbc2b
--- /dev/null
+++ b/docs/user_guide.md
@@ -0,0 +1,28 @@
+# User Guide
+
+For a quick introduction on how to build and install the Kubernetes Operator for Apache Flink, and how to run some sample applications, please refer to the [Quick Start Guide](quick-start-guide.md). For a complete reference of the custom resource definition of the `FlinkApplication`, please refer to the [API Specification](crd.md).
+
+## Working with FlinkApplications
+
+### Building a new Flink application
+The flink operator brings up Jobmanager and Taskmanager for an application in Kubernetes. It does this by creating [deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) objects based on the *image* field in the flink application custom resource object. For more information about building images, please refer to this [documentation](/examples/README.md) and these [examples](/examples/wordcount/).
+
+### Creating a New FlinkApplication
+
+A `FlinkApplication` can be created from a YAML file storing the `FlinkApplication` specification using the `kubectl apply -f` command. Once a `FlinkApplication` is successfully created, the operator will receive it and create a flink cluster as configured in the specification to run on the Kubernetes cluster.
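+
+For example, using the wordcount sample shipped with this repository (any `FlinkApplication` YAML can be applied the same way; add `-n <namespace>` if the resource should live outside your current namespace):
+
+```bash
+$ kubectl apply -f examples/wordcount/flink-operator-custom-resource.yaml
+$ kubectl get flinkapplication.flink.k8s.io wordcount-operator-example -o yaml
+```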
+
+### Deleting a FlinkApplication
+
+A `FlinkApplication` can be deleted using the `kubectl delete` command. Deleting a `FlinkApplication` deletes the Flink application custom resource and the flink cluster associated with it. If the flink job is running when the deletion happens, the flink job is cancelled with a savepoint before the cluster is deleted.
+
+### Updating an existing FlinkApplication
+
+A `FlinkApplication` can be updated using the `kubectl apply -f` command. When a `FlinkApplication` is successfully updated, the operator observes that the resource has changed. Before deleting the existing deployment, the operator will cancel the flink job with a savepoint. After the savepoint succeeds, the operator deletes the existing deployment and submits a new flink job from the savepoint in the new flink cluster.
+
+### Checking a FlinkApplication
+
+A `FlinkApplication` can be checked using the `kubectl describe flinkapplication.flink.k8s.io` command. The output of the command shows the specification and status of the `FlinkApplication` as well as events associated with it.
+
+## Customizing the flink operator
+
+To customize the flink operator, set/update these [configurations](https://github.com/lyft/flinkk8soperator/blob/master/pkg/controller/config/config.go). The values for the config can be set either through the [configmap](/deploy/config.yaml) or through the command line.
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000..f12b8b46
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,9 @@
+# Application examples
+
+This directory contains examples showing applications that produce a docker image which is compatible with the Flink operator. Please use these examples as a reference while building applications to be executed by the flink operator.
+
+* The Flink operator custom resource contains an **image** field, and expects the image to have both flink and the application code packaged in it.
+* The operator starts up Jobmanager and Taskmanager pods using [Container Args](https://godoc.org/k8s.io/api/core/v1#Container).
+* The operator submits the flink job through the [REST API](https://ci.apache.org/projects/flink/flink-docs-stable/monitoring/rest_api.html#jars-jarid-run) in the Jobmanager. For this to work, the jar file of the application needs to be present in the folder indicated by the config value [web.upload.dir](https://ci.apache.org/projects/flink/flink-docs-stable/ops/config.html#web-upload-dir).
+* The operator injects flink configuration through [environment variables](https://github.com/lyft/flinkk8soperator/blob/master/pkg/controller/flink/container_utils.go#L84). The image should have the support to integrate these into the existing configuration as illustrated in the [example here](https://github.com/lyft/flinkk8soperator/blob/master/examples/wordcount/docker-entrypoint.sh#L26).
+* If there are issues in the **image** that cause pods to restart, or the Flink cluster to not respond to REST API requests, the state machine will not transition beyond the **READY** state.
diff --git a/examples/wordcount/Dockerfile b/examples/wordcount/Dockerfile
new file mode 100644
index 00000000..9577f4b9
--- /dev/null
+++ b/examples/wordcount/Dockerfile
@@ -0,0 +1,69 @@
+FROM openjdk:8-jdk
+
+# Prepare environment
+ENV FLINK_HOME=/opt/flink
+ENV MAVEN_HOME=/opt/maven
+ENV HADOOP_HOME=/opt/hadoop
+ENV PATH=$FLINK_HOME/bin:$HADOOP_HOME/bin:$MAVEN_HOME/bin:$PATH
+
+COPY .
/code + +# Configure Flink version +ENV FLINK_VERSION=1.8.0 \ + HADOOP_SCALA_VARIANT=scala_2.11 + +# Install dependencies +RUN set -ex; \ + apt-get update; \ + apt-get -y install libsnappy1v5; \ + apt-get -y install netcat net-tools; \ + apt-get -y install gettext-base; \ + rm -rf /var/lib/apt/lists/* + +# Grab gosu for easy step-down from root +ENV GOSU_VERSION 1.7 +RUN set -ex; \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture)"; \ + wget -nv -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture).asc"; \ + export GNUPGHOME="$(mktemp -d)"; \ + rm -rf "$GNUPGHOME" /usr/local/bin/gosu.asc; \ + chmod +x /usr/local/bin/gosu; \ + gosu nobody true + +# Install Maven +RUN \ + wget https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.5.3/apache-maven-3.5.3-bin.tar.gz; \ + tar -zxvf apache-maven-3.5.3-bin.tar.gz; \ + mv apache-maven-3.5.3 $MAVEN_HOME; \ + rm apache-maven-3.5.3-bin.tar.gz + +WORKDIR /code + +RUN \ + mvn package; \ + mkdir -p /opt/flink/flink-web-upload; \ + cp flink-conf.yaml /usr/local/; \ + cp /code/target/*.jar /opt/flink/flink-web-upload/ + +RUN groupadd --system --gid=9999 flink && \ + useradd --system --home-dir $FLINK_HOME --uid=9999 --gid=flink flink +WORKDIR $FLINK_HOME + +ENV FLINK_URL_FILE_PATH=flink/flink-${FLINK_VERSION}/flink-${FLINK_VERSION}-bin-${HADOOP_SCALA_VARIANT}.tgz +ENV FLINK_TGZ_URL=https://mirrors.ocf.berkeley.edu/apache/$FLINK_URL_FILE_PATH + +# Install Flink +RUN set -ex; \ + wget -nv -O flink.tgz "$FLINK_TGZ_URL"; \ + \ + tar -xf flink.tgz --strip-components=1; \ + rm flink.tgz; \ + \ + chown -R flink:flink .; + +# control script expects manifest.yaml at this location +RUN chown -R flink:flink /var +COPY docker-entrypoint.sh / +ENTRYPOINT ["/docker-entrypoint.sh"] +EXPOSE 6123 8081 +CMD ["local"] diff --git a/examples/wordcount/docker-entrypoint.sh b/examples/wordcount/docker-entrypoint.sh new file mode 100755 index 00000000..7b336756 --- /dev/null +++ b/examples/wordcount/docker-entrypoint.sh @@ -0,0 +1,51 @@ +#!/bin/sh + +drop_privs_cmd() { + if [ -x /sbin/su-exec ]; then + # Alpine + echo su-exec + else + # Others + echo gosu + fi +} + + +envsubst < /usr/local/flink-conf.yaml > $FLINK_HOME/conf/flink-conf.yaml + +# As the taskmanager pods are accessible only by (cluster) ip address, +# we must manually configure this based on the podIp kubernetes +# variable, which is assigned to TASKMANAGER_HOSTNAME env var by the +# operator. 
+if [ -n "$TASKMANAGER_HOSTNAME" ]; then + echo "taskmanager.host: $TASKMANAGER_HOSTNAME" >> "$FLINK_HOME/conf/flink-conf.yaml" +fi + +# Add in extra configs set by the operator +if [ -n "$OPERATOR_FLINK_CONFIG" ]; then + echo "$OPERATOR_FLINK_CONFIG" >> "$FLINK_HOME/conf/flink-conf.yaml" +fi + +COMMAND=$@ + +if [ $# -lt 1 ]; then + COMMAND="local" +fi + +if [ "$COMMAND" = "help" ]; then + echo "Usage: $(basename "$0") (jobmanager|taskmanager|local|help)" + exit 0 +elif [ "$COMMAND" = "jobmanager" ]; then + echo "Starting Job Manager" + echo "config file: " && grep '^[^\n#]' "$FLINK_HOME/conf/flink-conf.yaml" + exec $(drop_privs_cmd) flink "$FLINK_HOME/bin/jobmanager.sh" start-foreground +elif [ "$COMMAND" = "taskmanager" ]; then + echo "Starting Task Manager" + echo "config file: " && grep '^[^\n#]' "$FLINK_HOME/conf/flink-conf.yaml" + exec $(drop_privs_cmd) flink "$FLINK_HOME/bin/taskmanager.sh" start-foreground +elif [ "$COMMAND" = "local" ]; then + echo "Starting local cluster" + exec $(drop_privs_cmd) flink "$FLINK_HOME/bin/jobmanager.sh" start-foreground local +fi + +exec "$@" diff --git a/examples/wordcount/flink-conf.yaml b/examples/wordcount/flink-conf.yaml new file mode 100644 index 00000000..24eb9bc2 --- /dev/null +++ b/examples/wordcount/flink-conf.yaml @@ -0,0 +1,30 @@ +jobmanager.web.submit.enable: true +jobmanager.web.log.path: /var/log/jobmanager/current + +jobmanager.web.upload.dir: /opt/flink + +taskmanager.log.path: /var/log/taskmanager/current +taskmanager.exit-on-fatal-akka-error: true +taskmanager.network.memory.max: 2147483648 +taskmanager.network.memory.fraction: 0.125 + +# Akka config +akka.framesize: 20MB +parallelism.default: 1 + +# State backend config +state.backend: rocksdb +state.checkpoints.num-retained: 4 + +# Restart strategy +restart-strategy: fixed-delay +restart-strategy.fixed-delay.delay: 0s +restart-strategy.fixed-delay.attempts: 2147483647 + +# These parameters control how often TaskManagers try to connect to a JobManager. 
+# These values are set a bit lower than the defaults to make recovery and cluster restarts +# a bit faster +taskmanager.maxRegistrationDuration: Inf +taskmanager.initial-registration-pause: 500 ms +taskmanager.max-registration-pause: 5 s +taskmanager.refused-registration-pause: 5 s diff --git a/examples/wordcount/flink-operator-custom-resource.yaml b/examples/wordcount/flink-operator-custom-resource.yaml new file mode 100644 index 00000000..6304ed46 --- /dev/null +++ b/examples/wordcount/flink-operator-custom-resource.yaml @@ -0,0 +1,31 @@ +apiVersion: flink.k8s.io/v1alpha1 +kind: FlinkApplication +metadata: + name: wordcount-operator-example + annotations: + labels: + environment: development +spec: + image: docker.io/lyft/wordcount-operator-example:latest + flinkConfig: + taskmanager.heap.size: 200 + state.backend.fs.checkpointdir: file:///checkpoints/flink/checkpoints + state.checkpoints.dir: file:///checkpoints/flink/externalized-checkpoints + state.savepoints.dir: file:///checkpoints/flink/savepoints + jobManagerConfig: + resources: + requests: + memory: "200Mi" + cpu: "0.2" + replicas: 1 + taskManagerConfig: + taskSlots: 2 + resources: + requests: + memory: "200Mi" + cpu: "0.2" + flinkVersion: "1.8" + jarName: "wordcount-operator-example-1.0.0-SNAPSHOT.jar" + parallelism: 3 + entryClass: "org.apache.flink.WordCount" + diff --git a/examples/wordcount/pom.xml b/examples/wordcount/pom.xml new file mode 100644 index 00000000..17866b1e --- /dev/null +++ b/examples/wordcount/pom.xml @@ -0,0 +1,31 @@ + + + 4.0.0 + org.apache.flink + wordcount-operator-example + 1.0.0-SNAPSHOT + jar + + + 1.8 + 1.8 + + + WordCount + + + + org.apache.flink + flink-java + 1.8.0 + + + org.apache.flink + flink-streaming-java_2.11 + 1.8.0 + + + + diff --git a/examples/wordcount/src/main/java/org/apache/flink/WordCount.java b/examples/wordcount/src/main/java/org/apache/flink/WordCount.java new file mode 100644 index 00000000..b31e12b3 --- /dev/null +++ b/examples/wordcount/src/main/java/org/apache/flink/WordCount.java @@ -0,0 +1,120 @@ +// https://github.com/apache/flink/blob/master/flink-examples/flink-examples-streaming/src/main/java/org/apache/flink/streaming/examples/wordcount/WordCount.java + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.flink;
+
+import org.apache.flink.api.common.functions.FlatMapFunction;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.api.java.utils.ParameterTool;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.util.WordCountData;
+import org.apache.flink.util.Collector;
+
+/**
+ * Implements the "WordCount" program that computes a simple word occurrence
+ * histogram over text files in a streaming fashion.
+ *
+ * <p>The input is a plain text file with lines separated by newline characters.
+ *
+ * <p>Usage: <code>WordCount --input &lt;path&gt; --output &lt;path&gt;</code><br>
+ * If no parameters are provided, the program is run with default data from
+ * {@link WordCountData}.
+ *
+ * <p>This example shows how to:
+ * <ul>
+ * <li>write a simple Flink Streaming program,
+ * <li>use tuple data types,
+ * <li>write and use user-defined functions.
+ * </ul>
+ */
+public class WordCount {
+
+	// *************************************************************************
+	// PROGRAM
+	// *************************************************************************
+
+	public static void main(String[] args) throws Exception {
+
+		// Checking input parameters
+		final ParameterTool params = ParameterTool.fromArgs(args);
+
+		// set up the execution environment
+		final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+		// make parameters available in the web interface
+		env.getConfig().setGlobalJobParameters(params);
+
+		// get input data
+		DataStream<String> text;
+		if (params.has("input")) {
+			// read the text file from given input path
+			text = env.readTextFile(params.get("input"));
+		} else {
+			System.out.println("Executing WordCount example with default input data set.");
+			System.out.println("Use --input to specify file input.");
+			// get default test text data
+			text = env.fromElements(WordCountData.WORDS);
+		}
+
+		DataStream<Tuple2<String, Integer>> counts =
+			// split up the lines in pairs (2-tuples) containing: (word,1)
+			text.flatMap(new Tokenizer())
+			// group by the tuple field "0" and sum up tuple field "1"
+			.keyBy(0).sum(1);
+
+		// emit result
+		if (params.has("output")) {
+			counts.writeAsText(params.get("output"));
+		} else {
+			System.out.println("Printing result to stdout. Use --output to specify output path.");
+			counts.print();
+		}
+
+		// execute program
+		env.execute("Streaming WordCount");
+	}
+
+	// *************************************************************************
+	// USER FUNCTIONS
+	// *************************************************************************
+
+	/**
+	 * Implements the string tokenizer that splits sentences into words as a
+	 * user-defined FlatMapFunction. The function takes a line (String) and
+	 * splits it into multiple pairs in the form of "(word,1)" ({@code Tuple2<String, Integer>}).
+	 */
+	public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
+
+		@Override
+		public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
+			// normalize and split the line
+			String[] tokens = value.toLowerCase().split("\\W+");
+
+			// emit the pairs
+			for (String token : tokens) {
+				if (token.length() > 0) {
+					out.collect(new Tuple2<>(token, 1));
+				}
+			}
+		}
+	}
+
+}
diff --git a/examples/wordcount/src/main/java/org/apache/flink/util/WordCountData.java b/examples/wordcount/src/main/java/org/apache/flink/util/WordCountData.java
new file mode 100644
index 00000000..012ab42e
--- /dev/null
+++ b/examples/wordcount/src/main/java/org/apache/flink/util/WordCountData.java
@@ -0,0 +1,74 @@
+// Copied from https://github.com/apache/flink/blob/master/flink-examples/flink-examples-batch/src/main/java/org/apache/flink/examples/java/wordcount/util/WordCountData.java
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.util; + +import org.apache.flink.api.java.DataSet; +import org.apache.flink.api.java.ExecutionEnvironment; + +/** + * Provides the default data sets used for the WordCount example program. + * The default data sets are used, if no parameters are given to the program. + * + */ +public class WordCountData { + + public static final String[] WORDS = new String[] { + "To be, or not to be,--that is the question:--", + "Whether 'tis nobler in the mind to suffer", + "The slings and arrows of outrageous fortune", + "Or to take arms against a sea of troubles,", + "And by opposing end them?--To die,--to sleep,--", + "No more; and by a sleep to say we end", + "The heartache, and the thousand natural shocks", + "That flesh is heir to,--'tis a consummation", + "Devoutly to be wish'd. To die,--to sleep;--", + "To sleep! perchance to dream:--ay, there's the rub;", + "For in that sleep of death what dreams may come,", + "When we have shuffled off this mortal coil,", + "Must give us pause: there's the respect", + "That makes calamity of so long life;", + "For who would bear the whips and scorns of time,", + "The oppressor's wrong, the proud man's contumely,", + "The pangs of despis'd love, the law's delay,", + "The insolence of office, and the spurns", + "That patient merit of the unworthy takes,", + "When he himself might his quietus make", + "With a bare bodkin? who would these fardels bear,", + "To grunt and sweat under a weary life,", + "But that the dread of something after death,--", + "The undiscover'd country, from whose bourn", + "No traveller returns,--puzzles the will,", + "And makes us rather bear those ills we have", + "Than fly to others that we know not of?", + "Thus conscience does make cowards of us all;", + "And thus the native hue of resolution", + "Is sicklied o'er with the pale cast of thought;", + "And enterprises of great pith and moment,", + "With this regard, their currents turn awry,", + "And lose the name of action.--Soft you now!", + "The fair Ophelia!--Nymph, in thy orisons", + "Be all my sins remember'd." + }; + + public static DataSet getDefaultTextLineDataSet(ExecutionEnvironment env) { + return env.fromElements(WORDS); + } +} diff --git a/integ/README.md b/integ/README.md new file mode 100644 index 00000000..82026304 --- /dev/null +++ b/integ/README.md @@ -0,0 +1,81 @@ +# Integration Tests + +This directory contains integration tests for the operator. These +tests involve running the operator against a real Kubernetes system to +validate its behavior. + +## Running the integration tests + +You will need a few things to run these tests. Firstly, you will need +a Kubernetes cluster and a kubeconfig file to talk to it. The easiest +way to get this is probably to install Docker for Mac (if on Mac) or +Minikube/microk8s on Linux. You will also need `kube proxy` running on +port 8001. + +The tests can run in two modes: direct and image. In direct mode, the +operator is run from the current source code from within the test. In +image mode the operator is submitted to Kubernetes as a deployment and +run from there. + +By default the tests create, use, and clean up the namespace +`flinkoperatortest`. + +These tests use a sample Flink job [operator-test-app](/integ/operator-test-app/). 
The
+tests currently use two images built from here:
+
+* `lyft/operator-test-app:6c45caca225489895cb1353dae25069b5d43746f.1`
+* `lyft/operator-test-app:6c45caca225489895cb1353dae25069b5d43746f.2`
+
+Those images are available on our private Dockerhub registry, and you
+will either need to pull them locally or give Kubernetes access to the
+registry.
+
+### Setup
+
+These tests create and mount a directory located at `/tmp/checkpoints`
+into containers. You may need to configure this directory as a bind
+mount. The tests also need to create this directory with
+world-writable permissions. On Linux this may require that you
+run `umask 000` before running the tests.
+
+```
+$ kubectl proxy &
+$ dep ensure
+```
+
+### Running in Direct mode
+
+(from within this directory)
+
+```
+$ INTEGRATION=true RUN_DIRECT=true go test
+```
+
+### Running in Image mode
+
+```
+$ INTEGRATION=true IMAGE={operator image} go test
+```
+
+Note that you will need to either build an image with the tag flinkk8soperator:latest or specify the operator image using the
+`IMAGE` environment variable.
+
+### Options
+
+The behavior of the tests is controlled via environment
+variables. Supported options include:
+
+* `INTEGRATION` If not set, all integration tests will be skipped
+* `KUBERNETES_CONFIG` Should point to your Kubernetes config file
+  (defaults to `~/.kube/config`)
+* `NAMESPACE` The namespace to use for all Kubernetes resources
+  created by the tests. If set to default, the test framework will not
+  create or delete the namespace.
+* `RUN_DIRECT` If set, will run the operator directly; otherwise will
+  run it via a deployment inside Kubernetes
+* `IMAGE` The image to use for the operator when running in image
+  mode. By default, `lyft/flinkk8soperator:latest`
+
+You can also pass [gocheck](http://labix.org/gocheck) options to the
+test runner. Particularly useful is `-check.vv` which will output logs
+from the operator and Flink pods to help debugging test failures.
diff --git a/integ/checkpoint_failure_test.go b/integ/checkpoint_failure_test.go
new file mode 100644
index 00000000..d47f0845
--- /dev/null
+++ b/integ/checkpoint_failure_test.go
@@ -0,0 +1,84 @@
+package integ
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"time"
+
+	"github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1"
+	"github.com/prometheus/common/log"
+	.
"gopkg.in/check.v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func failingJobTest(s *IntegSuite, c *C, testName string, causeFailure func()) { + // create a Flink app + config, err := s.Util.ReadFlinkApplication("test_app.yaml") + c.Assert(err, IsNil, Commentf("Failed to read test app yaml")) + config.Name = testName + "job" + config.Spec.DeleteMode = "ForceCancel" + + config.ObjectMeta.Labels["integTest"] = testName + + c.Assert(s.Util.CreateFlinkApplication(config), IsNil, + Commentf("Failed to create flink application")) + + // Cause it to fail + causeFailure() + + c.Assert(s.Util.WaitForPhase(config.Name, v1alpha1.FlinkApplicationRunning, v1alpha1.FlinkApplicationDeployFailed), IsNil) + + // wait a bit for it to start failing + time.Sleep(5 * time.Second) + + // Try to update it + app, err := s.Util.GetFlinkApplication(config.Name) + c.Assert(err, IsNil) + app.Spec.Image = NewImage + _, err = s.Util.FlinkApps().Update(app) + c.Assert(err, IsNil) + + // because the checkpoint will fail, the app should move to deploy failed + c.Assert(s.Util.WaitForPhase(config.Name, v1alpha1.FlinkApplicationDeployFailed), IsNil) + + // And the job should not have been updated + newApp, err := s.Util.GetFlinkApplication(config.Name) + c.Assert(err, IsNil) + c.Assert(newApp.Status.JobStatus.JobID, Equals, app.Status.JobStatus.JobID) + + endpoint := fmt.Sprintf("jobs/%s", app.Status.JobStatus.JobID) + _, err = s.Util.FlinkAPIGet(app, endpoint) + c.Assert(err, IsNil) + + // delete the application and ensure everything is cleaned up successfully + c.Assert(s.Util.FlinkApps().Delete(app.Name, &v1.DeleteOptions{}), IsNil) + + for { + pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). + List(v1.ListOptions{LabelSelector: "integTest=" + testName}) + c.Assert(err, IsNil) + if len(pods.Items) == 0 { + break + } + } + log.Info("All pods torn down") +} + +// Tests that we correctly handle updating a job with task failures +func (s *IntegSuite) TestJobWithTaskFailures(c *C) { + failingJobTest(s, c, "taskfailure", func() { + f, err := os.OpenFile(s.Util.CheckpointDir+"/fail", os.O_RDONLY|os.O_CREATE, 0666) + c.Assert(err, IsNil) + c.Assert(f.Close(), IsNil) + }) +} + +// Tests that we correctly handle updating a job with a checkpoint timeout +func (s *IntegSuite) TestCheckpointTimeout(c *C) { + failingJobTest(s, c, "checkpointtimeout", func() { + // cause checkpoints to take 120 seconds + err := ioutil.WriteFile(s.Util.CheckpointDir+"/checkpoint_delay", []byte("120000"), 0644) + c.Assert(err, IsNil) + }) +} diff --git a/integ/install.sh b/integ/install.sh new file mode 100755 index 00000000..2ead384b --- /dev/null +++ b/integ/install.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env sh + +set -e + +docker login -u "$DOCKER_REGISTRY_USERNAME" -p "$DOCKER_REGISTRY_PASSWORD" + +sudo snap install microk8s --classic --channel=1.12/stable + +sh boilerplate/lyft/golang_test_targets/dep_install.sh + +dep ensure diff --git a/integ/main_test.go b/integ/main_test.go new file mode 100644 index 00000000..d96a337c --- /dev/null +++ b/integ/main_test.go @@ -0,0 +1,144 @@ +package integ + +import ( + "fmt" + "os" + "path/filepath" + "testing" + "time" + + "github.com/lyft/flinkk8soperator/cmd/flinkk8soperator/cmd" + integFramework "github.com/lyft/flinkk8soperator/integ/utils" + controllerConfig "github.com/lyft/flinkk8soperator/pkg/controller/config" + flyteConfig "github.com/lyft/flytestdlib/config" + "github.com/prometheus/common/log" + . 
"gopkg.in/check.v1" + k8sErrors "k8s.io/apimachinery/pkg/api/errors" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/homedir" +) + +type IntegSuite struct { + Util *integFramework.TestUtil +} + +var _ = Suite(&IntegSuite{}) + +func Test(t *testing.T) { + TestingT(t) +} + +func (s *IntegSuite) SetUpSuite(c *C) { + // var namespace = flag.String("namespace", "flinkoperatortest", "namespace to use for testing") + var namespace = os.Getenv("NAMESPACE") + if namespace == "" { + namespace = "flinkoperatortest" + } + // var runDirect = flag.Bool("runDirect", false, "if set, runs the operator from the current source instead of from an image") + var runDirect = os.Getenv("RUN_DIRECT") != "" + // var image = flag.String("image", "lyft/flinkk8soperator:latest", "image for the operator") + var image = os.Getenv("OPERATOR_IMAGE") + if image == "" { + image = "flinkk8soperator:latest" + } + //var integration = flag.Bool("integration", false, "run integration tests") + var integration = os.Getenv("INTEGRATION") != "" + + if !integration { + // skip integration tests unless --integration is passed + c.Skip("--integration not provided") + return + } + + kubeconfig := os.Getenv("KUBERNETES_CONFIG") + if kubeconfig == "" { + kubeconfig = filepath.Join(homedir.HomeDir(), ".kube", "config") + err := os.Setenv("KUBERNETES_CONFIG", kubeconfig) + if err != nil { + c.Fatalf("Failed to set KUBERNETES_CONFIG env") + } + } + + checkpointDir := os.Getenv("CHECKPOINT_DIR") + if checkpointDir == "" { + checkpointDir = "/tmp/checkpoints" + } + + var err error + s.Util, err = integFramework.New(namespace, kubeconfig, image, checkpointDir) + if err != nil { + c.Fatalf("Failed to set up test util: %v", err) + } + + if err = s.Util.CreateCRD(); err != nil && !k8sErrors.IsAlreadyExists(err) { + c.Fatalf("Failed to create CRD: %v", err) + } + + if runDirect { + config := controllerConfig.Config{ + LimitNamespace: namespace, + UseProxy: true, + ResyncPeriod: flyteConfig.Duration{Duration: 3 * time.Second}, + StatemachineStalenessDuration: flyteConfig.Duration{Duration: 30 * time.Second}, + MetricsPrefix: "flinkk8soperator", + ProxyPort: flyteConfig.Port{Port: 8001}, + } + + log.Info("Running operator directly") + + go func() { + if err = cmd.Run(&config); err != nil { + c.Fatalf("Failed to run operator: %v", err) + } + }() + } else { + if err = s.Util.CreateOperator(); err != nil { + c.Fatalf("Failed to create operator: %v", err) + } + + if err = s.Util.TailOperatorLogs(); err != nil { + c.Fatalf("Failed to tail operator logs: %v", err) + } + } +} + +func (s *IntegSuite) TearDownSuite(c *C) { + if s != nil && s.Util != nil { + log.Info("Cleaning up") + s.Util.Cleanup() + } +} + +func (s *IntegSuite) SetUpTest(c *C) { + // create checkpoint directory + if _, err := os.Stat(s.Util.CheckpointDir); os.IsNotExist(err) { + c.Assert(os.Mkdir(s.Util.CheckpointDir, 0777), IsNil) + } +} + +func (s *IntegSuite) TearDownTest(c *C) { + jm, err := s.Util.GetJobManagerPod() + if err == nil { + fmt.Printf("\n\n######### JobManager logs for debugging #########\n---------------------------\n") + _ = s.Util.GetLogs(jm, nil) + } + + tms, err := s.Util.GetTaskManagerPods() + if err == nil { + for i, tm := range tms { + fmt.Printf("\n\n######### TaskManager %d logs for debugging "+ + "#########\n---------------------------\n", i) + _ = s.Util.GetLogs(tm, nil) + } + } + + err = s.Util.FlinkApps().DeleteCollection(nil, v1.ListOptions{}) + if err != nil { + log.Fatalf("Failed to clean up flink applications") + } + + err = 
os.RemoveAll(s.Util.CheckpointDir) + if err != nil { + log.Fatalf("Failed to clean up checkpoints directory: %v", err) + } +} diff --git a/integ/operator-test-app/Dockerfile b/integ/operator-test-app/Dockerfile new file mode 100644 index 00000000..5748489d --- /dev/null +++ b/integ/operator-test-app/Dockerfile @@ -0,0 +1,69 @@ +FROM openjdk:8-jdk + +# Prepare environment +ENV FLINK_HOME=/opt/flink +ENV MAVEN_HOME=/opt/maven +ENV HADOOP_HOME=/opt/hadoop +ENV PATH=$FLINK_HOME/bin:$HADOOP_HOME/bin:$MAVEN_HOME/bin:$PATH + +COPY . /code + +# Configure Flink version +ENV FLINK_VERSION=1.8.0 \ + HADOOP_SCALA_VARIANT=scala_2.11 + +# Install dependencies +RUN set -ex; \ + apt-get update; \ + apt-get -y install libsnappy1v5; \ + apt-get -y install netcat net-tools; \ + apt-get -y install gettext-base; \ + rm -rf /var/lib/apt/lists/* + +# Grab gosu for easy step-down from root +ENV GOSU_VERSION 1.7 +RUN set -ex; \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture)"; \ + wget -nv -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture).asc"; \ + export GNUPGHOME="$(mktemp -d)"; \ + rm -rf "$GNUPGHOME" /usr/local/bin/gosu.asc; \ + chmod +x /usr/local/bin/gosu; \ + gosu nobody true + +# Install Maven +RUN \ + wget https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.5.3/apache-maven-3.5.3-bin.tar.gz; \ + tar -zxvf apache-maven-3.5.3-bin.tar.gz; \ + mv apache-maven-3.5.3 $MAVEN_HOME; \ + rm apache-maven-3.5.3-bin.tar.gz + +WORKDIR /code + +RUN \ + mvn package; \ + mkdir -p /opt/flink/flink-web-upload; \ + cp flink-conf.yaml /usr/local/; \ + cp /code/target/operator-test-app-1.0.0-SNAPSHOT.jar /opt/flink/flink-web-upload/ + +RUN groupadd --system --gid=9999 flink && \ + useradd --system --home-dir $FLINK_HOME --uid=9999 --gid=flink flink +WORKDIR $FLINK_HOME + +ENV FLINK_URL_FILE_PATH=flink/flink-${FLINK_VERSION}/flink-${FLINK_VERSION}-bin-${HADOOP_SCALA_VARIANT}.tgz +ENV FLINK_TGZ_URL=https://mirrors.ocf.berkeley.edu/apache/$FLINK_URL_FILE_PATH + +# Install Flink +RUN set -ex; \ + wget -nv -O flink.tgz "$FLINK_TGZ_URL"; \ + \ + tar -xf flink.tgz --strip-components=1; \ + rm flink.tgz; \ + \ + chown -R flink:flink .; + +# control script expects manifest.yaml at this location +RUN chown -R flink:flink /var +COPY docker-entrypoint.sh / +ENTRYPOINT ["/docker-entrypoint.sh"] +EXPOSE 6123 8081 +CMD ["local"] diff --git a/integ/operator-test-app/docker-entrypoint.sh b/integ/operator-test-app/docker-entrypoint.sh new file mode 100755 index 00000000..7b336756 --- /dev/null +++ b/integ/operator-test-app/docker-entrypoint.sh @@ -0,0 +1,51 @@ +#!/bin/sh + +drop_privs_cmd() { + if [ -x /sbin/su-exec ]; then + # Alpine + echo su-exec + else + # Others + echo gosu + fi +} + + +envsubst < /usr/local/flink-conf.yaml > $FLINK_HOME/conf/flink-conf.yaml + +# As the taskmanager pods are accessible only by (cluster) ip address, +# we must manually configure this based on the podIp kubernetes +# variable, which is assigned to TASKMANAGER_HOSTNAME env var by the +# operator. 
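+# For example, if the operator injects TASKMANAGER_HOSTNAME=10.2.3.4 (a hypothetical pod IP),
+# the block below appends "taskmanager.host: 10.2.3.4" to flink-conf.yaml so the JobManager
+# can reach this TaskManager at its cluster IP.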
+if [ -n "$TASKMANAGER_HOSTNAME" ]; then + echo "taskmanager.host: $TASKMANAGER_HOSTNAME" >> "$FLINK_HOME/conf/flink-conf.yaml" +fi + +# Add in extra configs set by the operator +if [ -n "$OPERATOR_FLINK_CONFIG" ]; then + echo "$OPERATOR_FLINK_CONFIG" >> "$FLINK_HOME/conf/flink-conf.yaml" +fi + +COMMAND=$@ + +if [ $# -lt 1 ]; then + COMMAND="local" +fi + +if [ "$COMMAND" = "help" ]; then + echo "Usage: $(basename "$0") (jobmanager|taskmanager|local|help)" + exit 0 +elif [ "$COMMAND" = "jobmanager" ]; then + echo "Starting Job Manager" + echo "config file: " && grep '^[^\n#]' "$FLINK_HOME/conf/flink-conf.yaml" + exec $(drop_privs_cmd) flink "$FLINK_HOME/bin/jobmanager.sh" start-foreground +elif [ "$COMMAND" = "taskmanager" ]; then + echo "Starting Task Manager" + echo "config file: " && grep '^[^\n#]' "$FLINK_HOME/conf/flink-conf.yaml" + exec $(drop_privs_cmd) flink "$FLINK_HOME/bin/taskmanager.sh" start-foreground +elif [ "$COMMAND" = "local" ]; then + echo "Starting local cluster" + exec $(drop_privs_cmd) flink "$FLINK_HOME/bin/jobmanager.sh" start-foreground local +fi + +exec "$@" diff --git a/integ/operator-test-app/flink-conf.yaml b/integ/operator-test-app/flink-conf.yaml new file mode 100644 index 00000000..24eb9bc2 --- /dev/null +++ b/integ/operator-test-app/flink-conf.yaml @@ -0,0 +1,30 @@ +jobmanager.web.submit.enable: true +jobmanager.web.log.path: /var/log/jobmanager/current + +jobmanager.web.upload.dir: /opt/flink + +taskmanager.log.path: /var/log/taskmanager/current +taskmanager.exit-on-fatal-akka-error: true +taskmanager.network.memory.max: 2147483648 +taskmanager.network.memory.fraction: 0.125 + +# Akka config +akka.framesize: 20MB +parallelism.default: 1 + +# State backend config +state.backend: rocksdb +state.checkpoints.num-retained: 4 + +# Restart strategy +restart-strategy: fixed-delay +restart-strategy.fixed-delay.delay: 0s +restart-strategy.fixed-delay.attempts: 2147483647 + +# These parameters control how often TaskManagers try to connect to a JobManager. 
+# These values are set a bit lower than the defaults to make recovery and cluster restarts +# a bit faster +taskmanager.maxRegistrationDuration: Inf +taskmanager.initial-registration-pause: 500 ms +taskmanager.max-registration-pause: 5 s +taskmanager.refused-registration-pause: 5 s diff --git a/integ/operator-test-app/pom.xml b/integ/operator-test-app/pom.xml new file mode 100644 index 00000000..e1f10ddf --- /dev/null +++ b/integ/operator-test-app/pom.xml @@ -0,0 +1,31 @@ + + + 4.0.0 + com.lyft + operator-test-app + 1.0.0-SNAPSHOT + jar + + + 1.8 + 1.8 + + + operator-test-app + + + + org.apache.flink + flink-java + 1.8.0 + + + org.apache.flink + flink-streaming-java_2.11 + 1.8.0 + + + + diff --git a/integ/operator-test-app/src/main/java/com/lyft/OperatorTestApp.java b/integ/operator-test-app/src/main/java/com/lyft/OperatorTestApp.java new file mode 100644 index 00000000..c423aff8 --- /dev/null +++ b/integ/operator-test-app/src/main/java/com/lyft/OperatorTestApp.java @@ -0,0 +1,147 @@ +package com.lyft; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.typeinfo.TypeHint; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.TimeCharacteristic; +import org.apache.flink.streaming.api.checkpoint.ListCheckpointed; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.environment.CheckpointConfig; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; +import org.apache.flink.streaming.api.windowing.time.Time; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class OperatorTestApp { + public static class StreamingImpulseSource extends RichParallelSourceFunction implements + ListCheckpointed { + Logger log = LoggerFactory.getLogger(StreamingImpulseSource.class); + + private final AtomicBoolean cancelled = new AtomicBoolean(false); + private long count = 0; + private final int intervalMillis; + + public StreamingImpulseSource(int intervalMillis) { + this.intervalMillis = intervalMillis; + } + + @Override + public void run(SourceContext ctx) throws IOException { + while (!cancelled.get()) { + synchronized (ctx.getCheckpointLock()) { + ctx.collect(count++); + } + + try { + if (intervalMillis > 0) { + Thread.sleep(intervalMillis); + } + } catch (InterruptedException e) { + // pass + } + } + + } + + @Override + public void cancel() { + this.cancelled.set(true); + } + + @Override + public List snapshotState(long checkpointId, long timestamp) throws Exception { + File file = new File("/checkpoints/checkpoint_delay"); + if (file.exists()) { + String checkpointDelay = new String(Files.readAllBytes(file.toPath())) + .replaceAll("\n", ""); + int delay = Integer.valueOf(checkpointDelay); + log.info("Waiting {} milliseconds", delay); + System.out.println(String.format("PRINT Waiting %d milliseconds", delay)); + + try { + Thread.sleep(delay); + } catch (InterruptedException e) { + log.error("Interrupted", e); + } + } + + return Collections.singletonList(count); + } + + @Override + public void restoreState(List state) throws Exception { + if 
(!state.isEmpty()) { + count = state.get(0); + } + } + } + + public static class MaybeFail implements MapFunction { + + @Override + public Long map(Long x) throws Exception { + if (new File("/checkpoints/fail").exists()) { + throw new RuntimeException("FAILED!!!"); + } + + return x; + } + } + + public static void main(String[] args) throws Exception { + Logger log = LoggerFactory.getLogger(OperatorTestApp.class); + + log.info("Submitting job..."); + + String uid = "default"; + if (args.length > 0) { + uid = args[0]; + } + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + configureEnvironment(env); + + SingleOutputStreamOperator dataStream = env + .addSource(new StreamingImpulseSource(1000)) + .map(new MaybeFail()) + .map(x -> Tuple2.of(0, x)) + .returns(TypeInformation.of(new TypeHint>(){})) + .keyBy(0) + .timeWindow(Time.seconds(10)) + .max(1) + .uid(uid) + .map(x -> x.f1); + + dataStream.print(); + + env.execute("Window Count"); + } + + private static void configureEnvironment(StreamExecutionEnvironment env) { + env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); + env.getCheckpointConfig().setCheckpointTimeout(10_000); + env.enableCheckpointing(5_000); + env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime); + + if (System.getenv("EXTERNAL_CHECKPOINT") != null) { + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + } + + // It is normally safe to use this setting and it can be a big performance improvement as it + // skips a per-event serializer copy. The caveat is that you must treat your data objects as + // immutable. + env.getConfig().enableObjectReuse(); + } +} diff --git a/integ/setup.sh b/integ/setup.sh new file mode 100755 index 00000000..4abbc57e --- /dev/null +++ b/integ/setup.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +microk8s.start +microk8s.status --wait-ready +microk8s.enable dns + +microk8s.kubectl proxy --port 8001 & + +# Enable our private docker registry +# TODO: remove for open source +microk8s.kubectl create secret docker-registry dockerhub \ + --docker-server=docker.io \ + --docker-username=$DOCKER_REGISTRY_USERNAME \ + --docker-password=$DOCKER_REGISTRY_PASSWORD \ + --docker-email=none + +microk8s.kubectl config view > ~/.kube/config diff --git a/integ/simple_test.go b/integ/simple_test.go new file mode 100644 index 00000000..0eeb4137 --- /dev/null +++ b/integ/simple_test.go @@ -0,0 +1,312 @@ +package integ + +import ( + "encoding/json" + "fmt" + + "os" + "time" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/flink/client" + "github.com/prometheus/common/log" + . 
"gopkg.in/check.v1" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const NewImage = "lyft/operator-test-app:6c45caca225489895cb1353dae25069b5d43746f.2" + +func updateAndValidate(c *C, s *IntegSuite, name string, updateFn func(app *v1alpha1.FlinkApplication), failurePhase v1alpha1.FlinkApplicationPhase) *v1alpha1.FlinkApplication { + app, err := s.Util.GetFlinkApplication(name) + c.Assert(err, IsNil) + + // Update the app + updateFn(app) + + _, err = s.Util.FlinkApps().Update(app) + c.Assert(err, IsNil) + + c.Assert(s.Util.WaitForPhase(name, v1alpha1.FlinkApplicationSavepointing, failurePhase), IsNil) + c.Assert(s.Util.WaitForPhase(name, v1alpha1.FlinkApplicationRunning, failurePhase), IsNil) + c.Assert(s.Util.WaitForAllTasksInState(name, "RUNNING"), IsNil) + + // check that it really updated + newApp, err := s.Util.GetFlinkApplication(name) + c.Assert(err, IsNil) + c.Assert(newApp.Status.JobStatus.JobID, Not(Equals), app.Status.JobStatus.JobID) + + log.Info("New job started successfully") + + // check that we savepointed and restored correctly + endpoint := fmt.Sprintf("jobs/%s/checkpoints", newApp.Status.JobStatus.JobID) + res, err := s.Util.FlinkAPIGet(newApp, endpoint) + c.Assert(err, IsNil) + + body := res.(map[string]interface{}) + restored := (body["latest"].(map[string]interface{}))["restored"] + c.Assert(restored, NotNil) + + c.Assert(restored.(map[string]interface{})["is_savepoint"], Equals, true) + + // wait for the old cluster to be cleaned up + for { + pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). + List(v1.ListOptions{LabelSelector: "flink-app-hash=" + app.Status.DeployHash}) + c.Assert(err, IsNil) + if len(pods.Items) == 0 { + break + } + time.Sleep(100 * time.Millisecond) + } + + return newApp +} + +// Tests job submission, upgrade, rollback, and deletion +func (s *IntegSuite) TestSimple(c *C) { + const finalizer = "simple.finalizers.test.com" + + // start a simple app + config, err := s.Util.ReadFlinkApplication("test_app.yaml") + c.Assert(err, IsNil, Commentf("Failed to read test app yaml")) + + config.ObjectMeta.Labels["integTest"] = "test_simple" + // add a finalizer so that the flinkapplication won't be deleted until we've had a chance to look at it + config.Finalizers = append(config.Finalizers, finalizer) + + c.Assert(s.Util.CreateFlinkApplication(config), IsNil, + Commentf("Failed to create flink application")) + + c.Assert(s.Util.WaitForPhase(config.Name, v1alpha1.FlinkApplicationRunning, v1alpha1.FlinkApplicationDeployFailed), IsNil) + c.Assert(s.Util.WaitForAllTasksInState(config.Name, "RUNNING"), IsNil) + + pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). + List(v1.ListOptions{LabelSelector: "integTest=test_simple"}) + c.Assert(err, IsNil) + c.Assert(len(pods.Items), Equals, 3) + for _, pod := range pods.Items { + c.Assert(pod.Spec.Containers[0].Image, Equals, config.Spec.Image) + } + + log.Info("Application started successfully") + + // test updating the app with a new image + newApp := updateAndValidate(c, s, config.Name, func(app *v1alpha1.FlinkApplication) { + app.Spec.Image = NewImage + }, v1alpha1.FlinkApplicationDeployFailed) + // check that the pods have the new image + c.Assert(newApp.Spec.Image, Equals, NewImage) + pods, err = s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 
+ List(v1.ListOptions{LabelSelector: "integTest=test_simple"}) + c.Assert(err, IsNil) + c.Assert(len(pods.Items), Equals, 3) + for _, pod := range pods.Items { + c.Assert(pod.Spec.Containers[0].Image, Equals, NewImage) + } + + // test updating the app with a config change + newApp = updateAndValidate(c, s, config.Name, func(app *v1alpha1.FlinkApplication) { + app.Spec.FlinkConfig["akka.client.timeout"] = "23 s" + }, v1alpha1.FlinkApplicationDeployFailed) + // validate the config has been applied + res, err := s.Util.FlinkAPIGet(newApp, "/jobmanager/config") + c.Assert(err, IsNil) + body := res.([]interface{}) + value := func() interface{} { + for _, e := range body { + kv := e.(map[string]interface{}) + if kv["key"] == "akka.client.timeout" { + return kv["value"] + } + } + return nil + }() + c.Assert(value, Equals, "23 s") + + // Test updating the app with a bad jar name -- this should cause a failed deploy and roll back + + { + newApp, err := s.Util.GetFlinkApplication(config.Name) + c.Assert(err, IsNil) + newApp.Spec.JarName = "nonexistent.jar" + // this shouldn't be needed after STRMCMP-473 is fixed + newApp.Spec.RestartNonce = "rollback" + _, err = s.Util.FlinkApps().Update(newApp) + c.Assert(err, IsNil) + + c.Assert(s.Util.WaitForPhase(newApp.Name, v1alpha1.FlinkApplicationSavepointing, ""), IsNil) + // we should end up in the DeployFailed phase + c.Assert(s.Util.WaitForPhase(newApp.Name, v1alpha1.FlinkApplicationDeployFailed, ""), IsNil) + + log.Info("Job is in deploy failed, waiting for tasks to start") + + // but the job should have been resubmitted + c.Assert(s.Util.WaitForAllTasksInState(newApp.Name, "RUNNING"), IsNil) + + // the job id should have changed + jobID := newApp.Status.JobStatus.JobID + newApp, err = s.Util.GetFlinkApplication(newApp.Name) + c.Assert(err, IsNil) + c.Assert(newApp.Status.JobStatus.JobID, Not(Equals), jobID) + + // we should have restored from our savepoint + endpoint := fmt.Sprintf("jobs/%s/checkpoints", newApp.Status.JobStatus.JobID) + res, err := s.Util.FlinkAPIGet(newApp, endpoint) + c.Assert(err, IsNil) + + body := res.(map[string]interface{}) + restored := (body["latest"].(map[string]interface{}))["restored"] + c.Assert(restored, NotNil) + + c.Assert(restored.(map[string]interface{})["is_savepoint"], Equals, true) + + log.Info("Attempting to roll forward") + + // and we should be able to roll forward by resubmitting with a fixed config + updateAndValidate(c, s, config.Name, func(app *v1alpha1.FlinkApplication) { + app.Spec.JarName = config.Spec.JarName + app.Spec.RestartNonce = "rollback2" + }, "") + } + + // delete the application and ensure everything is cleaned up successfully + c.Assert(s.Util.FlinkApps().Delete(config.Name, &v1.DeleteOptions{}), IsNil) + + // validate that a savepoint was taken and the job was cancelled + var app *v1alpha1.FlinkApplication + for { + app, err = s.Util.GetFlinkApplication(config.Name) + c.Assert(err, IsNil) + + if len(app.Finalizers) == 1 && app.Finalizers[0] == finalizer { + break + } + time.Sleep(100 * time.Millisecond) + } + + c.Assert(app.Spec.SavepointInfo.SavepointLocation, NotNil) + job := func() map[string]interface{} { + jobs, _ := s.Util.FlinkAPIGet(app, "/jobs") + jobMap := jobs.(map[string]interface{}) + jobList := jobMap["jobs"].([]interface{}) + for _, j := range jobList { + job := j.(map[string]interface{}) + if job["id"] == app.Status.JobStatus.JobID { + return job + } + } + return nil + }() + + fmt.Printf("test job = %v", job) + c.Assert(job["status"], Equals, "CANCELED") + + // delete our 
finalizer + app.Finalizers = []string{} + _, err = s.Util.FlinkApps().Update(app) + c.Assert(err, IsNil) + + // wait until all pods are gone + for { + pods, err = s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). + List(v1.ListOptions{LabelSelector: "integTest=test_simple"}) + c.Assert(err, IsNil) + if len(pods.Items) == 0 { + break + } + time.Sleep(100 * time.Millisecond) + } + log.Info("All pods torn down") +} + +func (s *IntegSuite) TestRecovery(c *C) { + config, err := s.Util.ReadFlinkApplication("test_app.yaml") + c.Assert(err, IsNil, Commentf("Failed to read test app yaml")) + + config.Name = "testrecoveryjob" + config.ObjectMeta.Labels["integTest"] = "test_recovery" + envVar := corev1.EnvVar{ + Name: "EXTERNAL_CHECKPOINT", + Value: "1", + } + + config.Spec.JobManagerConfig.Environment.Env = + append(config.Spec.JobManagerConfig.Environment.Env, envVar) + config.Spec.TaskManagerConfig.Environment.Env = + append(config.Spec.TaskManagerConfig.Environment.Env, envVar) + + c.Assert(s.Util.CreateFlinkApplication(config), IsNil, + Commentf("Failed to create flink application")) + + c.Log("Application Created") + + // wait for it to be running + c.Assert(s.Util.WaitForPhase(config.Name, v1alpha1.FlinkApplicationRunning, v1alpha1.FlinkApplicationDeployFailed), IsNil) + c.Assert(s.Util.WaitForAllTasksInState(config.Name, "RUNNING"), IsNil) + + c.Log("Application running") + + // wait for checkpoints + app, err := s.Util.GetFlinkApplication(config.Name) + c.Assert(err, IsNil) + + endpoint := fmt.Sprintf("jobs/%s/checkpoints", app.Status.JobStatus.JobID) + for { + res, err := s.Util.FlinkAPIGet(app, endpoint) + c.Assert(err, IsNil) + + body, err := json.Marshal(res) + c.Assert(err, IsNil) + + var checkpoints client.CheckpointResponse + err = json.Unmarshal(body, &checkpoints) + c.Assert(err, IsNil) + + if checkpoints.Latest.Completed != nil { + c.Logf("Checkpoint created %s", checkpoints.Latest.Completed.ExternalPath) + break + } + } + + // cause the app to start failing + f, err := os.OpenFile(s.Util.CheckpointDir+"/fail", os.O_RDONLY|os.O_CREATE, 0666) + c.Assert(err, IsNil) + c.Assert(f.Close(), IsNil) + + // wait a bit + time.Sleep(1 * time.Second) + + // try to update the job + app.Spec.Image = NewImage + _, err = s.Util.FlinkApps().Update(app) + + for { + // wait until the new job is launched + newApp, err := s.Util.GetFlinkApplication(config.Name) + c.Assert(err, IsNil) + if newApp.Status.JobStatus.JobID != app.Status.JobStatus.JobID { + break + } + time.Sleep(100 * time.Millisecond) + } + + c.Assert(err, IsNil) + c.Assert(s.Util.WaitForPhase(config.Name, v1alpha1.FlinkApplicationRunning, v1alpha1.FlinkApplicationDeployFailed), IsNil) + + // stop it from failing + c.Assert(os.Remove(s.Util.CheckpointDir+"/fail"), IsNil) + c.Assert(s.Util.WaitForAllTasksInState(config.Name, "RUNNING"), IsNil) + + // delete the application + c.Assert(s.Util.FlinkApps().Delete(config.Name, &v1.DeleteOptions{}), IsNil) + for { + pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 
+ List(v1.ListOptions{LabelSelector: "integTest=test_recovery"}) + c.Assert(err, IsNil) + if len(pods.Items) == 0 { + break + } + } + log.Info("All pods torn down") +} diff --git a/integ/test.sh b/integ/test.sh new file mode 100755 index 00000000..daf41c70 --- /dev/null +++ b/integ/test.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -e + +export NAMESPACE=default +export OPERATOR_IMAGE=lyft/flinkk8soperator:$(git rev-parse HEAD) +export INTEGRATION=true + +# needed to create the checkpoints directory with world-writable permissions +umask 000 + +cd $(dirname "$0") +go test -timeout 20m -check.vv IntegSuite + diff --git a/integ/test_app.yaml b/integ/test_app.yaml new file mode 100644 index 00000000..a1586faa --- /dev/null +++ b/integ/test_app.yaml @@ -0,0 +1,41 @@ +apiVersion: flink.k8s.io/v1alpha1 +kind: FlinkApplication +metadata: + name: operator-test-app + annotations: + labels: + environment: development +spec: + image: lyft/operator-test-app:6c45caca225489895cb1353dae25069b5d43746f.1 + imagePullSecrets: + - name: dockerhub + flinkConfig: + taskmanager.heap.size: 200 + state.backend.fs.checkpointdir: file:///checkpoints/flink/checkpoints + state.checkpoints.dir: file:///checkpoints/flink/externalized-checkpoints + state.savepoints.dir: file:///checkpoints/flink/savepoints + jobManagerConfig: + resources: + requests: + memory: "200Mi" + cpu: "0.2" + replicas: 1 + taskManagerConfig: + taskSlots: 2 + resources: + requests: + memory: "200Mi" + cpu: "0.2" + volumeMounts: + - mountPath: /checkpoints + name: checkpoints + volumes: + - name: checkpoints + hostPath: + path: /tmp/checkpoints + type: Directory + flinkVersion: "1.8" + deployType: Dual + jarName: "operator-test-app-1.0.0-SNAPSHOT.jar" + parallelism: 3 + entryClass: "com.lyft.OperatorTestApp" diff --git a/integ/utils/utils.go b/integ/utils/utils.go new file mode 100644 index 00000000..2e716d08 --- /dev/null +++ b/integ/utils/utils.go @@ -0,0 +1,440 @@ +package utils + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" + "time" + + "github.com/go-resty/resty" + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + v1alpha12 "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/typed/app/v1alpha1" + "github.com/prometheus/common/log" + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1" + apiextensionsClientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/yaml" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) +import clientset "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned" + +type TestUtil struct { + KubeClient kubernetes.Interface + FlinkApplicationClient clientset.Interface + APIExtensionsClient apiextensionsClientset.Interface + Namespace *v1.Namespace + Image string + CheckpointDir string +} + +func New(namespaceName string, kubeconfig string, image string, checkpointDir string) (*TestUtil, error) { + config, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + if err != nil { + return nil, err + } + + client, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, err + } + + var namespace *v1.Namespace + if namespaceName == "default" { + namespace, err = client.CoreV1().Namespaces().Get("default", metav1.GetOptions{}) + if err != nil { + return nil, err + } + } else { + namespace, err = 
client.CoreV1().Namespaces().Create(&v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespaceName, + }, + }) + if err != nil { + return nil, err + } + } + + crdClient, err := clientset.NewForConfig(config) + if err != nil { + return nil, err + } + + apiextensionsClient, err := apiextensionsClientset.NewForConfig(config) + if err != nil { + return nil, err + } + + return &TestUtil{ + KubeClient: client, + FlinkApplicationClient: crdClient, + APIExtensionsClient: apiextensionsClient, + Namespace: namespace, + Image: image, + CheckpointDir: checkpointDir, + }, nil +} + +func (f *TestUtil) Cleanup() { + if f.Namespace.Name != "default" { + flinkApps, err := f.FlinkApps().List(metav1.ListOptions{}) + if err != nil { + log.Errorf("Failed to fetch flink apps during cleanup: %v", err) + } else { + // make sure none of the apps have left-over finalizers + for _, app := range flinkApps.Items { + if len(app.Finalizers) != 0 { + app.Finalizers = []string{} + _, _ = f.FlinkApps().Update(&app) + } + } + } + + err = f.KubeClient.CoreV1().Namespaces().Delete(f.Namespace.Name, &metav1.DeleteOptions{}) + if err != nil { + log.Errorf("Failed to clean up after test: %v", err) + } + } +} + +func getFile(relativePath string) (*os.File, error) { + path, err := filepath.Abs(relativePath) + if err != nil { + return nil, err + } + + return os.Open(path) +} + +func (f *TestUtil) CreateCRD() error { + file, err := getFile("../deploy/crd.yaml") + if err != nil { + return err + } + + crd := v1beta1.CustomResourceDefinition{} + err = yaml.NewYAMLOrJSONDecoder(file, 1024).Decode(&crd) + if err != nil { + return err + } + + crd.Namespace = f.Namespace.Name + fmt.Printf("crd %v", crd) + + _, err = f.APIExtensionsClient.ApiextensionsV1beta1().CustomResourceDefinitions().Create(&crd) + if err != nil { + return err + } + + return nil +} + +func (f *TestUtil) CreateOperator() error { + configValue := make(map[string]string) + configValue["development"] = "operator:\n containerNameFormat: \"%s-unknown\"\n statemachineStalenessDuration: 40s\n resyncPeriod: 5s" + + configMap := v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flink-operator-config", + Namespace: f.Namespace.Name, + }, + Data: configValue, + } + + if _, err := f.KubeClient.CoreV1().ConfigMaps(f.Namespace.Name).Create(&configMap); err != nil { + return err + } + + var replicas int32 = 1 + resources := make(map[v1.ResourceName]resource.Quantity) + resources[v1.ResourceCPU] = resource.MustParse("0.2") + resources[v1.ResourceMemory] = resource.MustParse("0.5Gi") + deployment := appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flinkk8soperatore", + Namespace: f.Namespace.Name, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: &replicas, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "flinkk8soperator", + }, + }, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "app": "flinkk8soperator", + }, + }, + Spec: v1.PodSpec{ + Volumes: []v1.Volume{ + { + Name: "config-volume", + VolumeSource: v1.VolumeSource{ + ConfigMap: &v1.ConfigMapVolumeSource{ + LocalObjectReference: v1.LocalObjectReference{ + Name: "flink-operator-config", + }, + Items: []v1.KeyToPath{ + { + Key: "development", + Path: "config.yaml", + }, + }, + }, + }, + }, + }, + Containers: []v1.Container{ + { + Name: "flinkk8soperator", + Image: f.Image, + Env: []v1.EnvVar{ + {Name: "OPERATOR_NAME", Value: "flinkk8soperator"}, + }, + Command: []string{"flinkoperator"}, + Args: []string{"--config", 
"/etc/flinkk8soperator/config/config.yaml"}, + Ports: []v1.ContainerPort{ + {ContainerPort: 10254}, + }, + Resources: v1.ResourceRequirements{ + Requests: resources, + }, + VolumeMounts: []v1.VolumeMount{ + {Name: "config-volume", MountPath: "/etc/flinkk8soperator/config"}, + }, + }, + }, + ImagePullSecrets: []v1.LocalObjectReference{ + {Name: "dockerhub"}, + }, + }, + }, + }, + } + + if _, err := f.KubeClient.AppsV1().Deployments(f.Namespace.Name).Create(&deployment); err != nil { + return err + } + + return nil +} + +func (f *TestUtil) GetJobManagerPod() (string, error) { + pods, err := f.KubeClient.CoreV1().Pods(f.Namespace.Name).List(metav1.ListOptions{}) + if err != nil { + return "", err + } + + for _, p := range pods.Items { + if strings.Contains(p.Name, "-jm-") { + return p.Name, nil + } + } + + return "", errors.New("no jobmanager pod found") +} + +func (f *TestUtil) GetTaskManagerPods() ([]string, error) { + tms := make([]string, 0) + pods, err := f.KubeClient.CoreV1().Pods(f.Namespace.Name).List(metav1.ListOptions{}) + + if err != nil { + return tms, err + } + + for _, p := range pods.Items { + if strings.Contains(p.Name, "-tm-") { + tms = append(tms, p.Name) + } + } + + return tms, nil +} + +func (f *TestUtil) GetLogs(podName string, lines *int64) error { + req := f.KubeClient.CoreV1().Pods(f.Namespace.Name). + GetLogs(podName, + &v1.PodLogOptions{ + TailLines: lines, + Follow: false, + }) + + readCloser, err := req.Stream() + if err != nil { + return err + } + + defer readCloser.Close() + _, err = io.Copy(os.Stdout, readCloser) + + if err != nil { + return err + } + + return nil +} + +func (f *TestUtil) TailOperatorLogs() error { + var podName string + for { + pods, err := f.KubeClient.CoreV1().Pods(f.Namespace.Name).List(metav1.ListOptions{ + LabelSelector: "app=flinkk8soperator", + }) + + if err != nil { + return err + } + + if len(pods.Items) == 0 || len(pods.Items[0].Status.ContainerStatuses) == 0 || !pods.Items[0].Status.ContainerStatuses[0].Ready { + time.Sleep(500 * time.Millisecond) + log.Info("Waiting for operator container to be ready...") + } else { + podName = pods.Items[0].Name + break + } + } + + log.Infof("Found operator pod %s, starting to tail logs...", podName) + + req := f.KubeClient.CoreV1().RESTClient().Get(). + Namespace(f.Namespace.Name). + Name(podName). + Resource("pods"). + SubResource("log"). 
+ Param("follow", "true") + + readerCloser, err := req.Stream() + if err != nil { + return err + } + + go func() { + defer readerCloser.Close() + _, err = io.Copy(os.Stderr, readerCloser) + if err != nil { + log.Errorf("Lost connection to operator logs %v", err) + } + }() + + return nil +} + +func (f *TestUtil) ReadFlinkApplication(path string) (*v1alpha1.FlinkApplication, error) { + file, err := getFile(path) + if err != nil { + return nil, err + } + + app := v1alpha1.FlinkApplication{} + err = yaml.NewYAMLOrJSONDecoder(file, 2048).Decode(&app) + if err != nil { + return nil, err + } + + app.Spec.Volumes[0].HostPath.Path = f.CheckpointDir + + return &app, nil +} + +func (f *TestUtil) FlinkApps() v1alpha12.FlinkApplicationInterface { + return f.FlinkApplicationClient.FlinkV1alpha1().FlinkApplications(f.Namespace.Name) +} + +func (f *TestUtil) CreateFlinkApplication(application *v1alpha1.FlinkApplication) error { + _, err := f.FlinkApps().Create(application) + return err +} + +func (f *TestUtil) GetFlinkApplication(name string) (*v1alpha1.FlinkApplication, error) { + return f.FlinkApps().Get(name, metav1.GetOptions{}) +} + +func (f *TestUtil) WaitForPhase(name string, phase v1alpha1.FlinkApplicationPhase, failurePhases ...v1alpha1.FlinkApplicationPhase) error { + for { + app, err := f.FlinkApps().Get(name, metav1.GetOptions{}) + + if err != nil { + return err + } + + if app.Status.Phase == phase { + return nil + } + + for _, p := range failurePhases { + if app.Status.Phase == p { + return fmt.Errorf("application entered %s phase", p) + } + } + + time.Sleep(200 * time.Millisecond) + } +} + +func (f *TestUtil) FlinkAPIGet(app *v1alpha1.FlinkApplication, endpoint string) (interface{}, error) { + url := fmt.Sprintf("http://localhost:8001/api/v1/namespaces/%s/"+ + "services/%s:8081/proxy/%s", + f.Namespace.Name, app.Name, endpoint) + + resp, err := resty.SetRedirectPolicy(resty.FlexibleRedirectPolicy(5)).R().Get(url) + if err != nil { + return nil, err + } + + if !resp.IsSuccess() { + return nil, fmt.Errorf("request failed with code %d", resp.StatusCode()) + } + + var result interface{} + err = json.Unmarshal(resp.Body(), &result) + if err != nil { + return nil, err + } + + return result, nil +} + +func (f *TestUtil) WaitForAllTasksInState(name string, state string) error { + flinkApp, err := f.GetFlinkApplication(name) + if err != nil { + return err + } + + endpoint := fmt.Sprintf("jobs/%s", flinkApp.Status.JobStatus.JobID) + for { + res, err := f.FlinkAPIGet(flinkApp, endpoint) + if err != nil { + return err + } + + body := res.(map[string]interface{}) + vertices := body["vertices"].([]interface{}) + + var allRunning = true + for _, vertex := range vertices { + allRunning = allRunning && (vertex.(map[string]interface{})["status"] == state) + } + + if allRunning && len(vertices) > 0 { + break + } + + time.Sleep(100 * time.Millisecond) + } + + // wait a little bit longer, as sometimes the flink api reports tasks as running + // just before they actually are + time.Sleep(5 * time.Second) + + return nil +} diff --git a/local_config.yaml b/local_config.yaml new file mode 100644 index 00000000..80f74aff --- /dev/null +++ b/local_config.yaml @@ -0,0 +1,16 @@ +# This is default configuration file. 
+# Real configuration when running inside K8s (local or otherwise) lives in a ConfigMap +# The operator will replace "job" field with the correct flink job name +# ingressUrlFormat: "{{$jobCluster}}.xyz.net" +operator: + useKubectlProxy: true + containerNameFormat: "%s-unknown" + statemachineStalenessDuration: 1m + metricsPrefix: "flinkk8soperator" + resyncPeriod: 10s + proxyPort: 8001 +logger: + show-source: true + level: 5 + formatter: + type: text \ No newline at end of file diff --git a/pkg/apis/app/addtoscheme_v1alpha1.go b/pkg/apis/app/addtoscheme_v1alpha1.go new file mode 100644 index 00000000..efa5f0e9 --- /dev/null +++ b/pkg/apis/app/addtoscheme_v1alpha1.go @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2018 Lyft. All rights reserved. + */ + +package apis + +import ( + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" +) + +func init() { + // Register the types with the Scheme so the components can map objects to GroupVersionKinds and back + AddToSchemes = append(AddToSchemes, v1alpha1.SchemeBuilder.AddToScheme) +} diff --git a/pkg/apis/app/apis.go b/pkg/apis/app/apis.go new file mode 100644 index 00000000..07dc9616 --- /dev/null +++ b/pkg/apis/app/apis.go @@ -0,0 +1,13 @@ +package apis + +import ( + "k8s.io/apimachinery/pkg/runtime" +) + +// AddToSchemes may be used to add all resources defined in the project to a Scheme +var AddToSchemes runtime.SchemeBuilder + +// AddToScheme adds all Resources to the Scheme +func AddToScheme(s *runtime.Scheme) error { + return AddToSchemes.AddToScheme(s) +} diff --git a/pkg/apis/app/v1alpha1/doc.go b/pkg/apis/app/v1alpha1/doc.go new file mode 100644 index 00000000..57abdd27 --- /dev/null +++ b/pkg/apis/app/v1alpha1/doc.go @@ -0,0 +1,3 @@ +// +k8s:deepcopy-gen=package +// +groupName=flink.k8s.io +package v1alpha1 diff --git a/pkg/apis/app/v1alpha1/register.go b/pkg/apis/app/v1alpha1/register.go new file mode 100644 index 00000000..11431c1c --- /dev/null +++ b/pkg/apis/app/v1alpha1/register.go @@ -0,0 +1,42 @@ +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +const ( + version = "v1alpha1" + groupName = "flink.k8s.io" + + FlinkApplicationKind = "FlinkApplication" +) + +var ( + SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) + AddToScheme = SchemeBuilder.AddToScheme + // SchemeGroupVersion is the group version used to register these objects. + SchemeGroupVersion = schema.GroupVersion{Group: groupName, Version: version} +) + +// GetKind takes an unqualified kind and returns back a Group qualified GroupKind +func Kind(kind string) schema.GroupKind { + return SchemeGroupVersion.WithKind(kind).GroupKind() +} + +// Resource takes an unqualified resource and returns a Group qualified GroupResource +func Resource(resource string) schema.GroupResource { + return SchemeGroupVersion.WithResource(resource).GroupResource() +} + +// addKnownTypes adds the set of types defined in this package to the supplied scheme. 
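+// Registering FlinkApplication and FlinkApplicationList (plus the standard meta/v1 types for
+// this GroupVersion) lets client-go map flink.k8s.io/v1alpha1 GroupVersionKinds to these Go
+// types and back when encoding and decoding API objects.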
+func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &FlinkApplication{}, + &FlinkApplicationList{}, + ) + + metav1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} diff --git a/pkg/apis/app/v1alpha1/types.go b/pkg/apis/app/v1alpha1/types.go new file mode 100644 index 00000000..5d6318fb --- /dev/null +++ b/pkg/apis/app/v1alpha1/types.go @@ -0,0 +1,257 @@ +package v1alpha1 + +import ( + "fmt" + + apiv1 "k8s.io/api/core/v1" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +type FlinkApplicationList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata"` + Items []FlinkApplication `json:"items"` +} + +// +genclient +// +genclient:noStatus +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +k8s:defaulter-gen=true +type FlinkApplication struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata"` + Spec FlinkApplicationSpec `json:"spec"` + Status FlinkApplicationStatus `json:"status,omitempty"` +} + +type FlinkApplicationSpec struct { + Image string `json:"image,omitempty" protobuf:"bytes,2,opt,name=image"` + ImagePullPolicy apiv1.PullPolicy `json:"imagePullPolicy,omitempty" protobuf:"bytes,14,opt,name=imagePullPolicy,casttype=PullPolicy"` + ImagePullSecrets []apiv1.LocalObjectReference `json:"imagePullSecrets,omitempty" patchStrategy:"merge" patchMergeKey:"name" protobuf:"bytes,15,rep,name=imagePullSecrets"` + FlinkConfig FlinkConfig `json:"flinkConfig"` + FlinkVersion string `json:"flinkVersion"` + TaskManagerConfig TaskManagerConfig `json:"taskManagerConfig,omitempty"` + JobManagerConfig JobManagerConfig `json:"jobManagerConfig,omitempty"` + JarName string `json:"jarName"` + Parallelism int32 `json:"parallelism"` + EntryClass string `json:"entryClass,omitempty"` + ProgramArgs string `json:"programArgs,omitempty"` + SavepointInfo SavepointInfo `json:"savepointInfo,omitempty"` + DeploymentMode DeploymentMode `json:"deploymentMode"` + RPCPort *int32 `json:"rpcPort,omitempty"` + BlobPort *int32 `json:"blobPort,omitempty"` + QueryPort *int32 `json:"queryPort,omitempty"` + UIPort *int32 `json:"uiPort,omitempty"` + MetricsQueryPort *int32 `json:"metricsQueryPort,omitempty"` + Volumes []apiv1.Volume `json:"volumes,omitempty"` + VolumeMounts []apiv1.VolumeMount `json:"volumeMounts,omitempty"` + RestartNonce string `json:"restartNonce"` + DeleteMode DeleteMode `json:"deleteMode"` +} + +type FlinkConfig map[string]interface{} + +// Workaround for https://github.com/kubernetes-sigs/kubebuilder/issues/528 +func (in *FlinkConfig) DeepCopyInto(out *FlinkConfig) { + if in == nil { + *out = nil + } else { + *out = make(map[string]interface{}, len(*in)) + for k, v := range *in { + (*out)[k] = deepCopyJSONValue(v) + } + } +} + +func deepCopyJSONValue(x interface{}) interface{} { + switch x := x.(type) { + case map[string]interface{}: + clone := make(map[string]interface{}, len(x)) + for k, v := range x { + clone[k] = deepCopyJSONValue(v) + } + return clone + case []interface{}: + clone := make([]interface{}, len(x)) + for i, v := range x { + clone[i] = deepCopyJSONValue(v) + } + return clone + case string, int, uint, int32, uint32, int64, uint64, bool, float32, float64, nil: + return x + default: + panic(fmt.Errorf("cannot deep copy %T", x)) + } +} + +func (in *FlinkConfig) DeepCopy() *FlinkConfig { + if in == nil { + return nil + } + out := new(FlinkConfig) + in.DeepCopyInto(out) + return out +} + +type 
JobManagerConfig struct { + Resources *apiv1.ResourceRequirements `json:"resources,omitempty"` + Environment EnvironmentConfig `json:"envConfig"` + Replicas *int32 `json:"replicas,omitempty"` + OffHeapMemoryFraction *float64 `json:"offHeapMemoryFraction,omitempty"` +} + +type TaskManagerConfig struct { + Resources *apiv1.ResourceRequirements `json:"resources,omitempty"` + Environment EnvironmentConfig `json:"envConfig"` + TaskSlots *int32 `json:"taskSlots,omitempty"` + OffHeapMemoryFraction *float64 `json:"offHeapMemoryFraction,omitempty"` +} + +type EnvironmentConfig struct { + EnvFrom []apiv1.EnvFromSource `json:"envFrom,omitempty"` + Env []apiv1.EnvVar `json:"env,omitempty"` +} + +type SavepointInfo struct { + SavepointLocation string `json:"savepointLocation,omitempty"` + TriggerID string `json:"triggerId,omitempty"` +} + +type FlinkClusterStatus struct { + Health HealthStatus `json:"health,omitempty"` + NumberOfTaskManagers int32 `json:"numberOfTaskManagers,omitempty"` + HealthyTaskManagers int32 `json:"healthyTaskManagers,omitepty"` + NumberOfTaskSlots int32 `json:"numberOfTaskSlots,omitempty"` + AvailableTaskSlots int32 `json:"availableTaskSlots"` +} + +type FlinkJobStatus struct { + JobID string `json:"jobID,omitEmpty"` + Health HealthStatus `json:"health,omitEmpty"` + State JobState `json:"state,omitEmpty"` + + JarName string `json:"jarName"` + Parallelism int32 `json:"parallelism"` + EntryClass string `json:"entryClass,omitempty"` + ProgramArgs string `json:"programArgs,omitempty"` + + StartTime *metav1.Time `json:"startTime,omitEmpty"` + JobRestartCount int32 `json:"jobRestartCount,omitEmpty"` + CompletedCheckpointCount int32 `json:"completedCheckpointCount,omitEmpty"` + FailedCheckpointCount int32 `json:"failedCheckpointCount,omitEmpty"` + LastCheckpointTime *metav1.Time `json:"lastCheckpointTime,omitEmpty"` + RestorePath string `json:"restorePath,omitEmpty"` + RestoreTime *metav1.Time `json:"restoreTime,omitEmpty"` + LastFailingTime *metav1.Time `json:"lastFailingTime,omitEmpty"` +} + +type FlinkApplicationStatus struct { + Phase FlinkApplicationPhase `json:"phase"` + StartedAt *metav1.Time `json:"startedAt,omitempty"` + LastUpdatedAt *metav1.Time `json:"lastUpdatedAt,omitempty"` + Reason string `json:"reason,omitempty"` + ClusterStatus FlinkClusterStatus `json:"clusterStatus,omitempty"` + JobStatus FlinkJobStatus `json:"jobStatus"` + FailedDeployHash string `json:"failedUpdateHash,omitEmpty"` + DeployHash string `json:"deployHash"` +} + +func (in *FlinkApplicationStatus) GetPhase() FlinkApplicationPhase { + return in.Phase +} + +func (in *FlinkApplicationStatus) UpdatePhase(phase FlinkApplicationPhase, reason string) { + now := metav1.Now() + if in.StartedAt == nil { + in.StartedAt = &now + in.LastUpdatedAt = &now + } + in.Reason = reason + in.Phase = phase +} + +func (in *FlinkApplicationStatus) TouchResource(reason string) { + now := metav1.Now() + in.LastUpdatedAt = &now + in.Reason = reason +} + +type FlinkApplicationPhase string + +func (p FlinkApplicationPhase) VerboseString() string { + phaseName := string(p) + if p == FlinkApplicationNew { + phaseName = "New" + } + return phaseName +} + +// As you add more ApplicationPhase please add it to FlinkApplicationPhases list +const ( + FlinkApplicationNew FlinkApplicationPhase = "" + FlinkApplicationUpdating FlinkApplicationPhase = "Updating" + FlinkApplicationClusterStarting FlinkApplicationPhase = "ClusterStarting" + FlinkApplicationSubmittingJob FlinkApplicationPhase = "SubmittingJob" + FlinkApplicationRunning 
FlinkApplicationPhase = "Running" + FlinkApplicationSavepointing FlinkApplicationPhase = "Savepointing" + FlinkApplicationDeleting FlinkApplicationPhase = "Deleting" + FlinkApplicationRollingBackJob FlinkApplicationPhase = "RollingBackJob" + FlinkApplicationDeployFailed FlinkApplicationPhase = "DeployFailed" +) + +var FlinkApplicationPhases = []FlinkApplicationPhase{ + FlinkApplicationNew, + FlinkApplicationUpdating, + FlinkApplicationClusterStarting, + FlinkApplicationSubmittingJob, + FlinkApplicationRunning, + FlinkApplicationSavepointing, + FlinkApplicationDeleting, + FlinkApplicationDeployFailed, + FlinkApplicationRollingBackJob, +} + +func IsRunningPhase(phase FlinkApplicationPhase) bool { + return phase == FlinkApplicationRunning || phase == FlinkApplicationDeployFailed +} + +type DeploymentMode string + +const ( + DeploymentModeSingle DeploymentMode = "Single" + DeploymentModeDual DeploymentMode = "Dual" +) + +type DeleteMode string + +const ( + DeleteModeSavepoint DeleteMode = "Savepoint" + DeleteModeForceCancel DeleteMode = "ForceCancel" + DeleteModeNone DeleteMode = "None" +) + +type HealthStatus string + +const ( + Green HealthStatus = "Green" + Yellow HealthStatus = "Yellow" + Red HealthStatus = "Red" +) + +type JobState string + +const ( + Created JobState = "CREATED" + Running JobState = "RUNNING" + Failing JobState = "FAILING" + Failed JobState = "FAILED" + Cancelling JobState = "CANCELLING" + Canceled JobState = "CANCELED" + Finished JobState = "FINISHED" + Restarting JobState = "RESTARTING" + Suspended JobState = "SUSPENDED" + Reconciling JobState = "RECONCILING" +) diff --git a/pkg/apis/app/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/app/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 00000000..bbc0f58f --- /dev/null +++ b/pkg/apis/app/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,319 @@ +// +build !ignore_autogenerated + +// Code generated by deepcopy-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1 "k8s.io/api/core/v1" + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvironmentConfig) DeepCopyInto(out *EnvironmentConfig) { + *out = *in + if in.EnvFrom != nil { + in, out := &in.EnvFrom, &out.EnvFrom + *out = make([]v1.EnvFromSource, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]v1.EnvVar, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvironmentConfig. +func (in *EnvironmentConfig) DeepCopy() *EnvironmentConfig { + if in == nil { + return nil + } + out := new(EnvironmentConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlinkApplication) DeepCopyInto(out *FlinkApplication) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlinkApplication. 
+func (in *FlinkApplication) DeepCopy() *FlinkApplication { + if in == nil { + return nil + } + out := new(FlinkApplication) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *FlinkApplication) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlinkApplicationList) DeepCopyInto(out *FlinkApplicationList) { + *out = *in + out.TypeMeta = in.TypeMeta + out.ListMeta = in.ListMeta + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]FlinkApplication, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlinkApplicationList. +func (in *FlinkApplicationList) DeepCopy() *FlinkApplicationList { + if in == nil { + return nil + } + out := new(FlinkApplicationList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *FlinkApplicationList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlinkApplicationSpec) DeepCopyInto(out *FlinkApplicationSpec) { + *out = *in + if in.ImagePullSecrets != nil { + in, out := &in.ImagePullSecrets, &out.ImagePullSecrets + *out = make([]v1.LocalObjectReference, len(*in)) + copy(*out, *in) + } + in.FlinkConfig.DeepCopyInto(&out.FlinkConfig) + in.TaskManagerConfig.DeepCopyInto(&out.TaskManagerConfig) + in.JobManagerConfig.DeepCopyInto(&out.JobManagerConfig) + out.SavepointInfo = in.SavepointInfo + if in.RPCPort != nil { + in, out := &in.RPCPort, &out.RPCPort + *out = new(int32) + **out = **in + } + if in.BlobPort != nil { + in, out := &in.BlobPort, &out.BlobPort + *out = new(int32) + **out = **in + } + if in.QueryPort != nil { + in, out := &in.QueryPort, &out.QueryPort + *out = new(int32) + **out = **in + } + if in.UIPort != nil { + in, out := &in.UIPort, &out.UIPort + *out = new(int32) + **out = **in + } + if in.MetricsQueryPort != nil { + in, out := &in.MetricsQueryPort, &out.MetricsQueryPort + *out = new(int32) + **out = **in + } + if in.Volumes != nil { + in, out := &in.Volumes, &out.Volumes + *out = make([]v1.Volume, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.VolumeMounts != nil { + in, out := &in.VolumeMounts, &out.VolumeMounts + *out = make([]v1.VolumeMount, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlinkApplicationSpec. +func (in *FlinkApplicationSpec) DeepCopy() *FlinkApplicationSpec { + if in == nil { + return nil + } + out := new(FlinkApplicationSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *FlinkApplicationStatus) DeepCopyInto(out *FlinkApplicationStatus) { + *out = *in + if in.StartedAt != nil { + in, out := &in.StartedAt, &out.StartedAt + *out = (*in).DeepCopy() + } + if in.LastUpdatedAt != nil { + in, out := &in.LastUpdatedAt, &out.LastUpdatedAt + *out = (*in).DeepCopy() + } + out.ClusterStatus = in.ClusterStatus + in.JobStatus.DeepCopyInto(&out.JobStatus) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlinkApplicationStatus. +func (in *FlinkApplicationStatus) DeepCopy() *FlinkApplicationStatus { + if in == nil { + return nil + } + out := new(FlinkApplicationStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlinkClusterStatus) DeepCopyInto(out *FlinkClusterStatus) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlinkClusterStatus. +func (in *FlinkClusterStatus) DeepCopy() *FlinkClusterStatus { + if in == nil { + return nil + } + out := new(FlinkClusterStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlinkJobStatus) DeepCopyInto(out *FlinkJobStatus) { + *out = *in + if in.StartTime != nil { + in, out := &in.StartTime, &out.StartTime + *out = (*in).DeepCopy() + } + if in.LastCheckpointTime != nil { + in, out := &in.LastCheckpointTime, &out.LastCheckpointTime + *out = (*in).DeepCopy() + } + if in.RestoreTime != nil { + in, out := &in.RestoreTime, &out.RestoreTime + *out = (*in).DeepCopy() + } + if in.LastFailingTime != nil { + in, out := &in.LastFailingTime, &out.LastFailingTime + *out = (*in).DeepCopy() + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlinkJobStatus. +func (in *FlinkJobStatus) DeepCopy() *FlinkJobStatus { + if in == nil { + return nil + } + out := new(FlinkJobStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *JobManagerConfig) DeepCopyInto(out *JobManagerConfig) { + *out = *in + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(v1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } + in.Environment.DeepCopyInto(&out.Environment) + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + *out = new(int32) + **out = **in + } + if in.OffHeapMemoryFraction != nil { + in, out := &in.OffHeapMemoryFraction, &out.OffHeapMemoryFraction + *out = new(float64) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new JobManagerConfig. +func (in *JobManagerConfig) DeepCopy() *JobManagerConfig { + if in == nil { + return nil + } + out := new(JobManagerConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SavepointInfo) DeepCopyInto(out *SavepointInfo) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SavepointInfo. 
+func (in *SavepointInfo) DeepCopy() *SavepointInfo { + if in == nil { + return nil + } + out := new(SavepointInfo) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TaskManagerConfig) DeepCopyInto(out *TaskManagerConfig) { + *out = *in + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(v1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } + in.Environment.DeepCopyInto(&out.Environment) + if in.TaskSlots != nil { + in, out := &in.TaskSlots, &out.TaskSlots + *out = new(int32) + **out = **in + } + if in.OffHeapMemoryFraction != nil { + in, out := &in.OffHeapMemoryFraction, &out.OffHeapMemoryFraction + *out = new(float64) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TaskManagerConfig. +func (in *TaskManagerConfig) DeepCopy() *TaskManagerConfig { + if in == nil { + return nil + } + out := new(TaskManagerConfig) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/client/clientset/versioned/clientset.go b/pkg/client/clientset/versioned/clientset.go new file mode 100644 index 00000000..bc1cb21a --- /dev/null +++ b/pkg/client/clientset/versioned/clientset.go @@ -0,0 +1,82 @@ +// Code generated by client-gen. DO NOT EDIT. + +package versioned + +import ( + flinkv1alpha1 "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/typed/app/v1alpha1" + discovery "k8s.io/client-go/discovery" + rest "k8s.io/client-go/rest" + flowcontrol "k8s.io/client-go/util/flowcontrol" +) + +type Interface interface { + Discovery() discovery.DiscoveryInterface + FlinkV1alpha1() flinkv1alpha1.FlinkV1alpha1Interface + // Deprecated: please explicitly pick a version if possible. + Flink() flinkv1alpha1.FlinkV1alpha1Interface +} + +// Clientset contains the clients for groups. Each group has exactly one +// version included in a Clientset. +type Clientset struct { + *discovery.DiscoveryClient + flinkV1alpha1 *flinkv1alpha1.FlinkV1alpha1Client +} + +// FlinkV1alpha1 retrieves the FlinkV1alpha1Client +func (c *Clientset) FlinkV1alpha1() flinkv1alpha1.FlinkV1alpha1Interface { + return c.flinkV1alpha1 +} + +// Deprecated: Flink retrieves the default version of FlinkClient. +// Please explicitly pick a version. +func (c *Clientset) Flink() flinkv1alpha1.FlinkV1alpha1Interface { + return c.flinkV1alpha1 +} + +// Discovery retrieves the DiscoveryClient +func (c *Clientset) Discovery() discovery.DiscoveryInterface { + if c == nil { + return nil + } + return c.DiscoveryClient +} + +// NewForConfig creates a new Clientset for the given config. +func NewForConfig(c *rest.Config) (*Clientset, error) { + configShallowCopy := *c + if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { + configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) + } + var cs Clientset + var err error + cs.flinkV1alpha1, err = flinkv1alpha1.NewForConfig(&configShallowCopy) + if err != nil { + return nil, err + } + + cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfig(&configShallowCopy) + if err != nil { + return nil, err + } + return &cs, nil +} + +// NewForConfigOrDie creates a new Clientset for the given config and +// panics if there is an error in the config. 
+func NewForConfigOrDie(c *rest.Config) *Clientset { + var cs Clientset + cs.flinkV1alpha1 = flinkv1alpha1.NewForConfigOrDie(c) + + cs.DiscoveryClient = discovery.NewDiscoveryClientForConfigOrDie(c) + return &cs +} + +// New creates a new Clientset for the given RESTClient. +func New(c rest.Interface) *Clientset { + var cs Clientset + cs.flinkV1alpha1 = flinkv1alpha1.New(c) + + cs.DiscoveryClient = discovery.NewDiscoveryClient(c) + return &cs +} diff --git a/pkg/client/clientset/versioned/doc.go b/pkg/client/clientset/versioned/doc.go new file mode 100644 index 00000000..0e0c2a89 --- /dev/null +++ b/pkg/client/clientset/versioned/doc.go @@ -0,0 +1,4 @@ +// Code generated by client-gen. DO NOT EDIT. + +// This package has the automatically generated clientset. +package versioned diff --git a/pkg/client/clientset/versioned/fake/clientset_generated.go b/pkg/client/clientset/versioned/fake/clientset_generated.go new file mode 100644 index 00000000..0a31575f --- /dev/null +++ b/pkg/client/clientset/versioned/fake/clientset_generated.go @@ -0,0 +1,66 @@ +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + clientset "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned" + flinkv1alpha1 "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/typed/app/v1alpha1" + fakeflinkv1alpha1 "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/typed/app/v1alpha1/fake" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/discovery" + fakediscovery "k8s.io/client-go/discovery/fake" + "k8s.io/client-go/testing" +) + +// NewSimpleClientset returns a clientset that will respond with the provided objects. +// It's backed by a very simple object tracker that processes creates, updates and deletions as-is, +// without applying any validations and/or defaults. It shouldn't be considered a replacement +// for a real clientset and is mostly useful in simple unit tests. +func NewSimpleClientset(objects ...runtime.Object) *Clientset { + o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder()) + for _, obj := range objects { + if err := o.Add(obj); err != nil { + panic(err) + } + } + + cs := &Clientset{} + cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} + cs.AddReactor("*", "*", testing.ObjectReaction(o)) + cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { + gvr := action.GetResource() + ns := action.GetNamespace() + watch, err := o.Watch(gvr, ns) + if err != nil { + return false, nil, err + } + return true, watch, nil + }) + + return cs +} + +// Clientset implements clientset.Interface. Meant to be embedded into a +// struct to get a default implementation. This makes faking out just the method +// you want to test easier. 
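+//
+// Illustrative sketch of how the fake might be used in a unit test; the application name, namespace,
+// and metav1 alias below are assumptions for the example, not part of the generated code:
+//
+//   cs := NewSimpleClientset(&v1alpha1.FlinkApplication{
+//       ObjectMeta: metav1.ObjectMeta{Name: "wordcount", Namespace: "default"},
+//   })
+//   app, err := cs.FlinkV1alpha1().FlinkApplications("default").Get("wordcount", metav1.GetOptions{})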
+type Clientset struct { + testing.Fake + discovery *fakediscovery.FakeDiscovery +} + +func (c *Clientset) Discovery() discovery.DiscoveryInterface { + return c.discovery +} + +var _ clientset.Interface = &Clientset{} + +// FlinkV1alpha1 retrieves the FlinkV1alpha1Client +func (c *Clientset) FlinkV1alpha1() flinkv1alpha1.FlinkV1alpha1Interface { + return &fakeflinkv1alpha1.FakeFlinkV1alpha1{Fake: &c.Fake} +} + +// Flink retrieves the FlinkV1alpha1Client +func (c *Clientset) Flink() flinkv1alpha1.FlinkV1alpha1Interface { + return &fakeflinkv1alpha1.FakeFlinkV1alpha1{Fake: &c.Fake} +} diff --git a/pkg/client/clientset/versioned/fake/doc.go b/pkg/client/clientset/versioned/fake/doc.go new file mode 100644 index 00000000..3630ed1c --- /dev/null +++ b/pkg/client/clientset/versioned/fake/doc.go @@ -0,0 +1,4 @@ +// Code generated by client-gen. DO NOT EDIT. + +// This package has the automatically generated fake clientset. +package fake diff --git a/pkg/client/clientset/versioned/fake/register.go b/pkg/client/clientset/versioned/fake/register.go new file mode 100644 index 00000000..87ae6037 --- /dev/null +++ b/pkg/client/clientset/versioned/fake/register.go @@ -0,0 +1,38 @@ +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + flinkv1alpha1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + schema "k8s.io/apimachinery/pkg/runtime/schema" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" +) + +var scheme = runtime.NewScheme() +var codecs = serializer.NewCodecFactory(scheme) +var parameterCodec = runtime.NewParameterCodec(scheme) + +func init() { + v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"}) + AddToScheme(scheme) +} + +// AddToScheme adds all types of this clientset into the given scheme. This allows composition +// of clientsets, like in: +// +// import ( +// "k8s.io/client-go/kubernetes" +// clientsetscheme "k8s.io/client-go/kubernetes/scheme" +// aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" +// ) +// +// kclientset, _ := kubernetes.NewForConfig(c) +// aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) +// +// After this, RawExtensions in Kubernetes types will serialize kube-aggregator types +// correctly. +func AddToScheme(scheme *runtime.Scheme) { + flinkv1alpha1.AddToScheme(scheme) +} diff --git a/pkg/client/clientset/versioned/scheme/doc.go b/pkg/client/clientset/versioned/scheme/doc.go new file mode 100644 index 00000000..14db57a5 --- /dev/null +++ b/pkg/client/clientset/versioned/scheme/doc.go @@ -0,0 +1,4 @@ +// Code generated by client-gen. DO NOT EDIT. + +// This package contains the scheme of the automatically generated clientset. +package scheme diff --git a/pkg/client/clientset/versioned/scheme/register.go b/pkg/client/clientset/versioned/scheme/register.go new file mode 100644 index 00000000..e5a17b00 --- /dev/null +++ b/pkg/client/clientset/versioned/scheme/register.go @@ -0,0 +1,38 @@ +// Code generated by client-gen. DO NOT EDIT. 
+ +package scheme + +import ( + flinkv1alpha1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + schema "k8s.io/apimachinery/pkg/runtime/schema" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" +) + +var Scheme = runtime.NewScheme() +var Codecs = serializer.NewCodecFactory(Scheme) +var ParameterCodec = runtime.NewParameterCodec(Scheme) + +func init() { + v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"}) + AddToScheme(Scheme) +} + +// AddToScheme adds all types of this clientset into the given scheme. This allows composition +// of clientsets, like in: +// +// import ( +// "k8s.io/client-go/kubernetes" +// clientsetscheme "k8s.io/client-go/kubernetes/scheme" +// aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" +// ) +// +// kclientset, _ := kubernetes.NewForConfig(c) +// aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) +// +// After this, RawExtensions in Kubernetes types will serialize kube-aggregator types +// correctly. +func AddToScheme(scheme *runtime.Scheme) { + flinkv1alpha1.AddToScheme(scheme) +} diff --git a/pkg/client/clientset/versioned/typed/app/v1alpha1/app_client.go b/pkg/client/clientset/versioned/typed/app/v1alpha1/app_client.go new file mode 100644 index 00000000..0b5e3fff --- /dev/null +++ b/pkg/client/clientset/versioned/typed/app/v1alpha1/app_client.go @@ -0,0 +1,74 @@ +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1alpha1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/scheme" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" + rest "k8s.io/client-go/rest" +) + +type FlinkV1alpha1Interface interface { + RESTClient() rest.Interface + FlinkApplicationsGetter +} + +// FlinkV1alpha1Client is used to interact with features provided by the flink.k8s.io group. +type FlinkV1alpha1Client struct { + restClient rest.Interface +} + +func (c *FlinkV1alpha1Client) FlinkApplications(namespace string) FlinkApplicationInterface { + return newFlinkApplications(c, namespace) +} + +// NewForConfig creates a new FlinkV1alpha1Client for the given config. +func NewForConfig(c *rest.Config) (*FlinkV1alpha1Client, error) { + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + client, err := rest.RESTClientFor(&config) + if err != nil { + return nil, err + } + return &FlinkV1alpha1Client{client}, nil +} + +// NewForConfigOrDie creates a new FlinkV1alpha1Client for the given config and +// panics if there is an error in the config. +func NewForConfigOrDie(c *rest.Config) *FlinkV1alpha1Client { + client, err := NewForConfig(c) + if err != nil { + panic(err) + } + return client +} + +// New creates a new FlinkV1alpha1Client for the given RESTClient. +func New(c rest.Interface) *FlinkV1alpha1Client { + return &FlinkV1alpha1Client{c} +} + +func setConfigDefaults(config *rest.Config) error { + gv := v1alpha1.SchemeGroupVersion + config.GroupVersion = &gv + config.APIPath = "/apis" + config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} + + if config.UserAgent == "" { + config.UserAgent = rest.DefaultKubernetesUserAgent() + } + + return nil +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. 
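+//
+// Given the defaults applied in setConfigDefaults (API path "/apis", group version flink.k8s.io/v1alpha1),
+// requests issued through this client are assumed to resolve to paths of the form
+// /apis/flink.k8s.io/v1alpha1/namespaces/<namespace>/flinkapplications/<name>.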
+func (c *FlinkV1alpha1Client) RESTClient() rest.Interface { + if c == nil { + return nil + } + return c.restClient +} diff --git a/pkg/client/clientset/versioned/typed/app/v1alpha1/doc.go b/pkg/client/clientset/versioned/typed/app/v1alpha1/doc.go new file mode 100644 index 00000000..93a7ca4e --- /dev/null +++ b/pkg/client/clientset/versioned/typed/app/v1alpha1/doc.go @@ -0,0 +1,4 @@ +// Code generated by client-gen. DO NOT EDIT. + +// This package has the automatically generated typed clients. +package v1alpha1 diff --git a/pkg/client/clientset/versioned/typed/app/v1alpha1/fake/doc.go b/pkg/client/clientset/versioned/typed/app/v1alpha1/fake/doc.go new file mode 100644 index 00000000..2b5ba4c8 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/app/v1alpha1/fake/doc.go @@ -0,0 +1,4 @@ +// Code generated by client-gen. DO NOT EDIT. + +// Package fake has the automatically generated clients. +package fake diff --git a/pkg/client/clientset/versioned/typed/app/v1alpha1/fake/fake_app_client.go b/pkg/client/clientset/versioned/typed/app/v1alpha1/fake/fake_app_client.go new file mode 100644 index 00000000..017abeb5 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/app/v1alpha1/fake/fake_app_client.go @@ -0,0 +1,24 @@ +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + v1alpha1 "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/typed/app/v1alpha1" + rest "k8s.io/client-go/rest" + testing "k8s.io/client-go/testing" +) + +type FakeFlinkV1alpha1 struct { + *testing.Fake +} + +func (c *FakeFlinkV1alpha1) FlinkApplications(namespace string) v1alpha1.FlinkApplicationInterface { + return &FakeFlinkApplications{c, namespace} +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *FakeFlinkV1alpha1) RESTClient() rest.Interface { + var ret *rest.RESTClient + return ret +} diff --git a/pkg/client/clientset/versioned/typed/app/v1alpha1/fake/fake_flinkapplication.go b/pkg/client/clientset/versioned/typed/app/v1alpha1/fake/fake_flinkapplication.go new file mode 100644 index 00000000..f1d1b4d5 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/app/v1alpha1/fake/fake_flinkapplication.go @@ -0,0 +1,112 @@ +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + v1alpha1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + schema "k8s.io/apimachinery/pkg/runtime/schema" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakeFlinkApplications implements FlinkApplicationInterface +type FakeFlinkApplications struct { + Fake *FakeFlinkV1alpha1 + ns string +} + +var flinkapplicationsResource = schema.GroupVersionResource{Group: "flink.k8s.io", Version: "v1alpha1", Resource: "flinkapplications"} + +var flinkapplicationsKind = schema.GroupVersionKind{Group: "flink.k8s.io", Version: "v1alpha1", Kind: "FlinkApplication"} + +// Get takes name of the flinkApplication, and returns the corresponding flinkApplication object, and an error if there is any. +func (c *FakeFlinkApplications) Get(name string, options v1.GetOptions) (result *v1alpha1.FlinkApplication, err error) { + obj, err := c.Fake. 
+ Invokes(testing.NewGetAction(flinkapplicationsResource, c.ns, name), &v1alpha1.FlinkApplication{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.FlinkApplication), err +} + +// List takes label and field selectors, and returns the list of FlinkApplications that match those selectors. +func (c *FakeFlinkApplications) List(opts v1.ListOptions) (result *v1alpha1.FlinkApplicationList, err error) { + obj, err := c.Fake. + Invokes(testing.NewListAction(flinkapplicationsResource, flinkapplicationsKind, c.ns, opts), &v1alpha1.FlinkApplicationList{}) + + if obj == nil { + return nil, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &v1alpha1.FlinkApplicationList{ListMeta: obj.(*v1alpha1.FlinkApplicationList).ListMeta} + for _, item := range obj.(*v1alpha1.FlinkApplicationList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested flinkApplications. +func (c *FakeFlinkApplications) Watch(opts v1.ListOptions) (watch.Interface, error) { + return c.Fake. + InvokesWatch(testing.NewWatchAction(flinkapplicationsResource, c.ns, opts)) + +} + +// Create takes the representation of a flinkApplication and creates it. Returns the server's representation of the flinkApplication, and an error, if there is any. +func (c *FakeFlinkApplications) Create(flinkApplication *v1alpha1.FlinkApplication) (result *v1alpha1.FlinkApplication, err error) { + obj, err := c.Fake. + Invokes(testing.NewCreateAction(flinkapplicationsResource, c.ns, flinkApplication), &v1alpha1.FlinkApplication{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.FlinkApplication), err +} + +// Update takes the representation of a flinkApplication and updates it. Returns the server's representation of the flinkApplication, and an error, if there is any. +func (c *FakeFlinkApplications) Update(flinkApplication *v1alpha1.FlinkApplication) (result *v1alpha1.FlinkApplication, err error) { + obj, err := c.Fake. + Invokes(testing.NewUpdateAction(flinkapplicationsResource, c.ns, flinkApplication), &v1alpha1.FlinkApplication{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.FlinkApplication), err +} + +// Delete takes name of the flinkApplication and deletes it. Returns an error if one occurs. +func (c *FakeFlinkApplications) Delete(name string, options *v1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewDeleteAction(flinkapplicationsResource, c.ns, name), &v1alpha1.FlinkApplication{}) + + return err +} + +// DeleteCollection deletes a collection of objects. +func (c *FakeFlinkApplications) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + action := testing.NewDeleteCollectionAction(flinkapplicationsResource, c.ns, listOptions) + + _, err := c.Fake.Invokes(action, &v1alpha1.FlinkApplicationList{}) + return err +} + +// Patch applies the patch and returns the patched flinkApplication. +func (c *FakeFlinkApplications) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.FlinkApplication, err error) { + obj, err := c.Fake. 
+ Invokes(testing.NewPatchSubresourceAction(flinkapplicationsResource, c.ns, name, data, subresources...), &v1alpha1.FlinkApplication{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.FlinkApplication), err +} diff --git a/pkg/client/clientset/versioned/typed/app/v1alpha1/flinkapplication.go b/pkg/client/clientset/versioned/typed/app/v1alpha1/flinkapplication.go new file mode 100644 index 00000000..b80d6ca9 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/app/v1alpha1/flinkapplication.go @@ -0,0 +1,141 @@ +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1alpha1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + scheme "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/scheme" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// FlinkApplicationsGetter has a method to return a FlinkApplicationInterface. +// A group's client should implement this interface. +type FlinkApplicationsGetter interface { + FlinkApplications(namespace string) FlinkApplicationInterface +} + +// FlinkApplicationInterface has methods to work with FlinkApplication resources. +type FlinkApplicationInterface interface { + Create(*v1alpha1.FlinkApplication) (*v1alpha1.FlinkApplication, error) + Update(*v1alpha1.FlinkApplication) (*v1alpha1.FlinkApplication, error) + Delete(name string, options *v1.DeleteOptions) error + DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error + Get(name string, options v1.GetOptions) (*v1alpha1.FlinkApplication, error) + List(opts v1.ListOptions) (*v1alpha1.FlinkApplicationList, error) + Watch(opts v1.ListOptions) (watch.Interface, error) + Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.FlinkApplication, err error) + FlinkApplicationExpansion +} + +// flinkApplications implements FlinkApplicationInterface +type flinkApplications struct { + client rest.Interface + ns string +} + +// newFlinkApplications returns a FlinkApplications +func newFlinkApplications(c *FlinkV1alpha1Client, namespace string) *flinkApplications { + return &flinkApplications{ + client: c.RESTClient(), + ns: namespace, + } +} + +// Get takes name of the flinkApplication, and returns the corresponding flinkApplication object, and an error if there is any. +func (c *flinkApplications) Get(name string, options v1.GetOptions) (result *v1alpha1.FlinkApplication, err error) { + result = &v1alpha1.FlinkApplication{} + err = c.client.Get(). + Namespace(c.ns). + Resource("flinkapplications"). + Name(name). + VersionedParams(&options, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// List takes label and field selectors, and returns the list of FlinkApplications that match those selectors. +func (c *flinkApplications) List(opts v1.ListOptions) (result *v1alpha1.FlinkApplicationList, err error) { + result = &v1alpha1.FlinkApplicationList{} + err = c.client.Get(). + Namespace(c.ns). + Resource("flinkapplications"). + VersionedParams(&opts, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// Watch returns a watch.Interface that watches the requested flinkApplications. +func (c *flinkApplications) Watch(opts v1.ListOptions) (watch.Interface, error) { + opts.Watch = true + return c.client.Get(). + Namespace(c.ns). + Resource("flinkapplications"). + VersionedParams(&opts, scheme.ParameterCodec). 
+ Watch() +} + +// Create takes the representation of a flinkApplication and creates it. Returns the server's representation of the flinkApplication, and an error, if there is any. +func (c *flinkApplications) Create(flinkApplication *v1alpha1.FlinkApplication) (result *v1alpha1.FlinkApplication, err error) { + result = &v1alpha1.FlinkApplication{} + err = c.client.Post(). + Namespace(c.ns). + Resource("flinkapplications"). + Body(flinkApplication). + Do(). + Into(result) + return +} + +// Update takes the representation of a flinkApplication and updates it. Returns the server's representation of the flinkApplication, and an error, if there is any. +func (c *flinkApplications) Update(flinkApplication *v1alpha1.FlinkApplication) (result *v1alpha1.FlinkApplication, err error) { + result = &v1alpha1.FlinkApplication{} + err = c.client.Put(). + Namespace(c.ns). + Resource("flinkapplications"). + Name(flinkApplication.Name). + Body(flinkApplication). + Do(). + Into(result) + return +} + +// Delete takes name of the flinkApplication and deletes it. Returns an error if one occurs. +func (c *flinkApplications) Delete(name string, options *v1.DeleteOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("flinkapplications"). + Name(name). + Body(options). + Do(). + Error() +} + +// DeleteCollection deletes a collection of objects. +func (c *flinkApplications) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("flinkapplications"). + VersionedParams(&listOptions, scheme.ParameterCodec). + Body(options). + Do(). + Error() +} + +// Patch applies the patch and returns the patched flinkApplication. +func (c *flinkApplications) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.FlinkApplication, err error) { + result = &v1alpha1.FlinkApplication{} + err = c.client.Patch(pt). + Namespace(c.ns). + Resource("flinkapplications"). + SubResource(subresources...). + Name(name). + Body(data). + Do(). + Into(result) + return +} diff --git a/pkg/client/clientset/versioned/typed/app/v1alpha1/generated_expansion.go b/pkg/client/clientset/versioned/typed/app/v1alpha1/generated_expansion.go new file mode 100644 index 00000000..ab955a79 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/app/v1alpha1/generated_expansion.go @@ -0,0 +1,5 @@ +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +type FlinkApplicationExpansion interface{} diff --git a/pkg/controller/add_flinkapplication.go b/pkg/controller/add_flinkapplication.go new file mode 100644 index 00000000..c7b54464 --- /dev/null +++ b/pkg/controller/add_flinkapplication.go @@ -0,0 +1,10 @@ +package controller + +import ( + "github.com/lyft/flinkk8soperator/pkg/controller/flinkapplication" +) + +func init() { + // AddToManagerFuncs is a list of functions to create controllers and add them to a manager. 
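+ // Registering flinkapplication.Add here lets AddToManager (pkg/controller/controller.go) invoke it
+ // with the shared manager and runtime config alongside any other registered controllers.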
+ AddToManagerFuncs = append(AddToManagerFuncs, flinkapplication.Add) +} diff --git a/pkg/controller/common/utils.go b/pkg/controller/common/utils.go new file mode 100644 index 00000000..97ddd9c7 --- /dev/null +++ b/pkg/controller/common/utils.go @@ -0,0 +1,58 @@ +package common + +import ( + "github.com/lyft/flytestdlib/contextutils" + + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" +) + +func GetValidLabelNames() []contextutils.Key { + return []contextutils.Key{ + contextutils.AppNameKey, + contextutils.NamespaceKey, + } +} + +func DuplicateMap(o map[string]string) (r map[string]string) { + if o == nil { + return map[string]string{} + } + r = make(map[string]string, len(o)) + for k, v := range o { + r[k] = v + } + return +} + +func CopyMap(to map[string]string, from map[string]string) map[string]string { + if len(to) == 0 && len(from) == 0 { + return to + } + if len(from) == 0 { + return to + } + if len(to) == 0 { + to = make(map[string]string, len(from)) + } + for k, v := range from { + to[k] = v + } + return to +} + +func GetEnvVar(envs []v1.EnvVar, name string) *v1.EnvVar { + for _, v := range envs { + if v.Name == name { + return &v + } + } + + return nil +} + +type FlinkDeployment struct { + Jobmanager *appsv1.Deployment + Taskmanager *appsv1.Deployment + Hash string +} diff --git a/pkg/controller/config/config.go b/pkg/controller/config/config.go new file mode 100644 index 00000000..f64f579f --- /dev/null +++ b/pkg/controller/config/config.go @@ -0,0 +1,32 @@ +package config + +import ( + "github.com/lyft/flytestdlib/config" +) + +//go:generate pflags Config + +const configSectionKey = "operator" + +var ConfigSection = config.MustRegisterSection(configSectionKey, &Config{}) + +type Config struct { + ResyncPeriod config.Duration `json:"resyncPeriod" pflag:"\"30s\",Determines the resync period for all watchers."` + LimitNamespace string `json:"limitNamespace" pflag:"\"\",Namespaces to watch for by flink operator"` + MetricsPrefix string `json:"metricsPrefix" pflag:"\"flinkk8soperator\",Prefix for metrics propagated to prometheus"` + ProfilerPort config.Port `json:"prof-port" pflag:"\"10254\",Profiler port"` + FlinkIngressURLFormat string `json:"ingressUrlFormat"` + UseProxy bool `json:"useKubectlProxy"` + ProxyPort config.Port `json:"ProxyPort" pflag:"\"8001\",The port at which flink cluster runs locally"` + ContainerNameFormat string `json:"containerNameFormat"` + Workers int `json:"workers" pflag:"4,Number of routines to process custom resource"` + StatemachineStalenessDuration config.Duration `json:"statemachineStalenessDuration" pflag:"\"5m\",Duration for statemachine staleness."` +} + +func GetConfig() *Config { + return ConfigSection.GetConfig().(*Config) +} + +func SetConfig(c *Config) error { + return ConfigSection.SetConfig(c) +} diff --git a/pkg/controller/config/config_flags.go b/pkg/controller/config/config_flags.go new file mode 100755 index 00000000..97882f4b --- /dev/null +++ b/pkg/controller/config/config_flags.go @@ -0,0 +1,27 @@ +// Code generated by go generate; DO NOT EDIT. +// This file was generated by robots. + +package config + +import ( + "fmt" + + "github.com/spf13/pflag" +) + +// GetPFlagSet will return strongly types pflags for all fields in Config and its nested types. The format of the +// flags is json-name.json-sub-name... etc. 
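+//
+// A hypothetical invocation (the "operator." prefix is an assumption based on the config section key;
+// this generated file does not enforce any particular prefix):
+//
+//   flags := (Config{}).GetPFlagSet("operator.")
+//   // yields flags such as --operator.resyncPeriod, --operator.limitNamespace and --operator.workers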
+func (Config) GetPFlagSet(prefix string) *pflag.FlagSet { + cmdFlags := pflag.NewFlagSet("Config", pflag.ExitOnError) + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "resyncPeriod"), "30s", "Determines the resync period for all watchers.") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "limitNamespace"), "", "Namespaces to watch for by flink operator") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "metricsPrefix"), "flinkk8soperator", "Prefix for metrics propagated to prometheus") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "prof-port"), "10254", "Profiler port") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "ingressUrlFormat"), *new(string), "") + cmdFlags.Bool(fmt.Sprintf("%v%v", prefix, "useKubectlProxy"), *new(bool), "") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "ProxyPort"), "8001", "The port at which flink cluster runs locally") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "containerNameFormat"), *new(string), "") + cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "workers"), 4, "Number of routines to process custom resource") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "statemachineStalenessDuration"), "5m", "Duration for statemachine staleness.") + return cmdFlags +} diff --git a/pkg/controller/config/config_flags_test.go b/pkg/controller/config/config_flags_test.go new file mode 100755 index 00000000..3123b99a --- /dev/null +++ b/pkg/controller/config/config_flags_test.go @@ -0,0 +1,322 @@ +// Code generated by go generate; DO NOT EDIT. +// This file was generated by robots. + +package config + +import ( + "encoding/json" + "fmt" + "reflect" + "strings" + "testing" + + "github.com/mitchellh/mapstructure" + "github.com/stretchr/testify/assert" +) + +var dereferencableKindsConfig = map[reflect.Kind]struct{}{ + reflect.Array: {}, reflect.Chan: {}, reflect.Map: {}, reflect.Ptr: {}, reflect.Slice: {}, +} + +// Checks if t is a kind that can be dereferenced to get its underlying type. +func canGetElementConfig(t reflect.Kind) bool { + _, exists := dereferencableKindsConfig[t] + return exists +} + +// This decoder hook tests types for json unmarshaling capability. If implemented, it uses json unmarshal to build the +// object. Otherwise, it'll just pass on the original data. +func jsonUnmarshalerHookConfig(_, to reflect.Type, data interface{}) (interface{}, error) { + unmarshalerType := reflect.TypeOf((*json.Unmarshaler)(nil)).Elem() + if to.Implements(unmarshalerType) || reflect.PtrTo(to).Implements(unmarshalerType) || + (canGetElementConfig(to.Kind()) && to.Elem().Implements(unmarshalerType)) { + + raw, err := json.Marshal(data) + if err != nil { + fmt.Printf("Failed to marshal Data: %v. Error: %v. Skipping jsonUnmarshalHook", data, err) + return data, nil + } + + res := reflect.New(to).Interface() + err = json.Unmarshal(raw, &res) + if err != nil { + fmt.Printf("Failed to umarshal Data: %v. Error: %v. 
Skipping jsonUnmarshalHook", data, err) + return data, nil + } + + return res, nil + } + + return data, nil +} + +func decode_Config(input, result interface{}) error { + config := &mapstructure.DecoderConfig{ + TagName: "json", + WeaklyTypedInput: true, + Result: result, + DecodeHook: mapstructure.ComposeDecodeHookFunc( + mapstructure.StringToTimeDurationHookFunc(), + mapstructure.StringToSliceHookFunc(","), + jsonUnmarshalerHookConfig, + ), + } + + decoder, err := mapstructure.NewDecoder(config) + if err != nil { + return err + } + + return decoder.Decode(input) +} + +func join_Config(arr interface{}, sep string) string { + listValue := reflect.ValueOf(arr) + strs := make([]string, 0, listValue.Len()) + for i := 0; i < listValue.Len(); i++ { + strs = append(strs, fmt.Sprintf("%v", listValue.Index(i))) + } + + return strings.Join(strs, sep) +} + +func testDecodeJson_Config(t *testing.T, val, result interface{}) { + assert.NoError(t, decode_Config(val, result)) +} + +func testDecodeSlice_Config(t *testing.T, vStringSlice, result interface{}) { + assert.NoError(t, decode_Config(vStringSlice, result)) +} + +func TestConfig_GetPFlagSet(t *testing.T) { + val := Config{} + cmdFlags := val.GetPFlagSet("") + assert.True(t, cmdFlags.HasFlags()) +} + +func TestConfig_SetFlags(t *testing.T) { + actual := Config{} + cmdFlags := actual.GetPFlagSet("") + assert.True(t, cmdFlags.HasFlags()) + + t.Run("Test_resyncPeriod", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("resyncPeriod"); err == nil { + assert.Equal(t, string("30s"), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "30s" + + cmdFlags.Set("resyncPeriod", testValue) + if vString, err := cmdFlags.GetString("resyncPeriod"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.ResyncPeriod) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_limitNamespace", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("limitNamespace"); err == nil { + assert.Equal(t, string(""), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("limitNamespace", testValue) + if vString, err := cmdFlags.GetString("limitNamespace"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.LimitNamespace) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_metricsPrefix", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("metricsPrefix"); err == nil { + assert.Equal(t, string("flinkk8soperator"), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("metricsPrefix", testValue) + if vString, err := cmdFlags.GetString("metricsPrefix"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.MetricsPrefix) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_prof-port", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("prof-port"); err == nil { + assert.Equal(t, string("10254"), vString) + } else { + 
assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "10254" + + cmdFlags.Set("prof-port", testValue) + if vString, err := cmdFlags.GetString("prof-port"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.ProfilerPort) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_ingressUrlFormat", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("ingressUrlFormat"); err == nil { + assert.Equal(t, string(*new(string)), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("ingressUrlFormat", testValue) + if vString, err := cmdFlags.GetString("ingressUrlFormat"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.FlinkIngressURLFormat) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_useKubectlProxy", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vBool, err := cmdFlags.GetBool("useKubectlProxy"); err == nil { + assert.Equal(t, bool(*new(bool)), vBool) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("useKubectlProxy", testValue) + if vBool, err := cmdFlags.GetBool("useKubectlProxy"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vBool), &actual.UseProxy) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_ProxyPort", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("ProxyPort"); err == nil { + assert.Equal(t, string("8001"), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "8001" + + cmdFlags.Set("ProxyPort", testValue) + if vString, err := cmdFlags.GetString("ProxyPort"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.ProxyPort) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_containerNameFormat", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("containerNameFormat"); err == nil { + assert.Equal(t, string(*new(string)), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("containerNameFormat", testValue) + if vString, err := cmdFlags.GetString("containerNameFormat"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.ContainerNameFormat) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_workers", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vInt, err := cmdFlags.GetInt("workers"); err == nil { + assert.Equal(t, int(4), vInt) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("workers", testValue) + if vInt, err := cmdFlags.GetInt("workers"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vInt), &actual.Workers) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_statemachineStalenessDuration", func(t *testing.T) { + t.Run("DefaultValue", func(t 
*testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("statemachineStalenessDuration"); err == nil { + assert.Equal(t, string("5m"), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "5m" + + cmdFlags.Set("statemachineStalenessDuration", testValue) + if vString, err := cmdFlags.GetString("statemachineStalenessDuration"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.StatemachineStalenessDuration) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) +} diff --git a/pkg/controller/config/runtime_config.go b/pkg/controller/config/runtime_config.go new file mode 100644 index 00000000..b855b9dc --- /dev/null +++ b/pkg/controller/config/runtime_config.go @@ -0,0 +1,9 @@ +package config + +import ( + "github.com/lyft/flytestdlib/promutils" +) + +type RuntimeConfig struct { + MetricsScope promutils.Scope +} diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go new file mode 100644 index 00000000..fc4acf8d --- /dev/null +++ b/pkg/controller/controller.go @@ -0,0 +1,21 @@ +package controller + +import ( + "context" + + "github.com/lyft/flinkk8soperator/pkg/controller/config" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +// AddToManagerFuncs is a list of functions to add all Controllers to the Manager +var AddToManagerFuncs []func(context.Context, manager.Manager, config.RuntimeConfig) error + +// AddToManager adds all Controllers to the Manager +func AddToManager(ctx context.Context, m manager.Manager, runtimeCfg config.RuntimeConfig) error { + for _, f := range AddToManagerFuncs { + if err := f(ctx, m, runtimeCfg); err != nil { + return err + } + } + return nil +} diff --git a/pkg/controller/errors/codes.go b/pkg/controller/errors/codes.go new file mode 100644 index 00000000..fbd69457 --- /dev/null +++ b/pkg/controller/errors/codes.go @@ -0,0 +1,10 @@ +package errors + +type ErrorCode = string + +const ( + IllegalStateError ErrorCode = "IllegalStateError" + CausedByError ErrorCode = "CausedByError" + BadJobSpecificationError ErrorCode = "BadJobSpecificationError" + ReconciliationNeeded ErrorCode = "ReconciliationNeeded" +) diff --git a/pkg/controller/errors/error.go b/pkg/controller/errors/error.go new file mode 100644 index 00000000..d9e0f713 --- /dev/null +++ b/pkg/controller/errors/error.go @@ -0,0 +1,56 @@ +package errors + +import ( + "fmt" +) + +type ErrorMessage = string + +type FlinkOperatorError struct { + Code ErrorCode + Message ErrorMessage +} + +func (w *FlinkOperatorError) Error() string { + return fmt.Sprintf("ErrorCode: [%v] Reason: [%v]", w.Code, w.Message) +} + +type FlinkOperatorErrorWithCause struct { + *FlinkOperatorError + cause error +} + +func (w *FlinkOperatorErrorWithCause) Error() string { + return fmt.Sprintf("%v. Caused By [%v]", w.FlinkOperatorError.Error(), w.cause) +} + +func (w *FlinkOperatorErrorWithCause) Cause() error { + return w.cause +} + +func errorf(c ErrorCode, msgFmt string, args ...interface{}) *FlinkOperatorError { + return &FlinkOperatorError{ + Code: c, + Message: fmt.Sprintf(msgFmt, args...), + } +} + +func Errorf(c ErrorCode, msgFmt string, args ...interface{}) error { + return errorf(c, msgFmt, args...) 
+} + +func WrapErrorf(c ErrorCode, cause error, msgFmt string, args ...interface{}) error { + return &FlinkOperatorErrorWithCause{ + FlinkOperatorError: errorf(c, msgFmt, args...), + cause: cause, + } +} + +func IsReconciliationNeeded(err error) bool { + if fErr, ok := err.(*FlinkOperatorError); ok { + if fErr.Code == ReconciliationNeeded { + return true + } + } + return false +} diff --git a/pkg/controller/flink/client/api.go b/pkg/controller/flink/client/api.go new file mode 100644 index 00000000..fb01a55b --- /dev/null +++ b/pkg/controller/flink/client/api.go @@ -0,0 +1,368 @@ +package client + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "net/http" + + "github.com/go-resty/resty" + "github.com/lyft/flinkk8soperator/pkg/controller/config" + "github.com/lyft/flytestdlib/logger" + "github.com/lyft/flytestdlib/promutils" + "github.com/lyft/flytestdlib/promutils/labeled" + "github.com/pkg/errors" +) + +const submitJobURL = "/jars/%s/run" +const savepointURL = "/jobs/%s/savepoints" +const jobURL = "/jobs/%s" +const checkSavepointStatusURL = "/jobs/%s/savepoints/%s" +const getJobsURL = "/jobs" +const getJobsOverviewURL = "/jobs/%s" +const getJobConfigURL = "/jobs/%s/config" +const getOverviewURL = "/overview" +const checkpointsURL = "/jobs/%s/checkpoints" +const taskmanagersURL = "/taskmanagers" +const httpGet = "GET" +const httpPost = "POST" +const httpPatch = "PATCH" +const retryCount = 3 +const timeOut = 5 * time.Second + +type FlinkAPIInterface interface { + CancelJobWithSavepoint(ctx context.Context, url string, jobID string) (string, error) + ForceCancelJob(ctx context.Context, url string, jobID string) error + SubmitJob(ctx context.Context, url string, jarID string, submitJobRequest SubmitJobRequest) (*SubmitJobResponse, error) + CheckSavepointStatus(ctx context.Context, url string, jobID, triggerID string) (*SavepointResponse, error) + GetJobs(ctx context.Context, url string) (*GetJobsResponse, error) + GetClusterOverview(ctx context.Context, url string) (*ClusterOverviewResponse, error) + GetLatestCheckpoint(ctx context.Context, url string, jobID string) (*CheckpointStatistics, error) + GetJobConfig(ctx context.Context, url string, jobID string) (*JobConfigResponse, error) + GetTaskManagers(ctx context.Context, url string) (*TaskManagersResponse, error) + GetCheckpointCounts(ctx context.Context, url string, jobID string) (*CheckpointResponse, error) + GetJobOverview(ctx context.Context, url string, jobID string) (*FlinkJobOverview, error) +} + +type FlinkJobManagerClient struct { + client *resty.Client + metrics *flinkJobManagerClientMetrics +} + +type flinkJobManagerClientMetrics struct { + scope promutils.Scope + submitJobSuccessCounter labeled.Counter + submitJobFailureCounter labeled.Counter + cancelJobSuccessCounter labeled.Counter + cancelJobFailureCounter labeled.Counter + forceCancelJobSuccessCounter labeled.Counter + forceCancelJobFailureCounter labeled.Counter + checkSavepointSuccessCounter labeled.Counter + checkSavepointFailureCounter labeled.Counter + getJobsSuccessCounter labeled.Counter + getJobsFailureCounter labeled.Counter + getJobConfigSuccessCounter labeled.Counter + getJobConfigFailureCounter labeled.Counter + getClusterSuccessCounter labeled.Counter + getClusterFailureCounter labeled.Counter + getCheckpointsSuccessCounter labeled.Counter + getCheckpointsFailureCounter labeled.Counter +} + +func newFlinkJobManagerClientMetrics(scope promutils.Scope) *flinkJobManagerClientMetrics { + flinkJmClientScope := scope.NewSubScope("flink_jm_client") + 
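+ // Every counter below is registered under the "flink_jm_client" sub-scope created above, which in
+ // turn hangs off the metrics scope supplied through config.RuntimeConfig.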
return &flinkJobManagerClientMetrics{ + scope: scope, + submitJobSuccessCounter: labeled.NewCounter("submit_job_success", "Flink job submission successful", flinkJmClientScope), + submitJobFailureCounter: labeled.NewCounter("submit_job_failure", "Flink job submission failed", flinkJmClientScope), + cancelJobSuccessCounter: labeled.NewCounter("cancel_job_success", "Flink job cancellation successful", flinkJmClientScope), + cancelJobFailureCounter: labeled.NewCounter("cancel_job_failure", "Flink job cancellation failed", flinkJmClientScope), + forceCancelJobSuccessCounter: labeled.NewCounter("force_cancel_job_success", "Flink forced job cancellation successful", flinkJmClientScope), + forceCancelJobFailureCounter: labeled.NewCounter("force_cancel_job_failure", "Flink forced job cancellation failed", flinkJmClientScope), + checkSavepointSuccessCounter: labeled.NewCounter("check_savepoint_status_success", "Flink check savepoint status successful", flinkJmClientScope), + checkSavepointFailureCounter: labeled.NewCounter("check_savepoint_status_failure", "Flink check savepoint status failed", flinkJmClientScope), + getJobsSuccessCounter: labeled.NewCounter("get_jobs_success", "Get flink jobs succeeded", flinkJmClientScope), + getJobsFailureCounter: labeled.NewCounter("get_jobs_failure", "Get flink jobs failed", flinkJmClientScope), + getJobConfigSuccessCounter: labeled.NewCounter("get_job_config_success", "Get flink job config succeeded", flinkJmClientScope), + getJobConfigFailureCounter: labeled.NewCounter("get_job_config_failure", "Get flink job config failed", flinkJmClientScope), + getClusterSuccessCounter: labeled.NewCounter("get_cluster_success", "Get cluster overview succeeded", flinkJmClientScope), + getClusterFailureCounter: labeled.NewCounter("get_cluster_failure", "Get cluster overview failed", flinkJmClientScope), + getCheckpointsSuccessCounter: labeled.NewCounter("get_checkpoints_success", "Get checkpoint request succeeded", flinkJmClientScope), + getCheckpointsFailureCounter: labeled.NewCounter("get_checkpoints_failed", "Get checkpoint request failed", flinkJmClientScope), + } +} + +func (c *FlinkJobManagerClient) GetJobConfig(ctx context.Context, url, jobID string) (*JobConfigResponse, error) { + path := fmt.Sprintf(getJobConfigURL, jobID) + url = url + path + + response, err := c.executeRequest(httpGet, url, nil) + if err != nil { + c.metrics.getJobConfigFailureCounter.Inc(ctx) + return nil, errors.Wrap(err, "GetJobConfig API request failed") + } + + if response != nil && !response.IsSuccess() { + c.metrics.getJobConfigFailureCounter.Inc(ctx) + logger.Errorf(ctx, fmt.Sprintf("Get Jobconfig failed with response %v", response)) + return nil, errors.New(fmt.Sprintf("Get Jobconfig failed with status %v", response.Status())) + } + var jobConfigResponse JobConfigResponse + if err := json.Unmarshal(response.Body(), &jobConfigResponse); err != nil { + logger.Errorf(ctx, "Unable to Unmarshal jobPlanResponse %v, err: %v", response, err) + return nil, err + } + c.metrics.getJobConfigSuccessCounter.Inc(ctx) + return &jobConfigResponse, nil +} + +func (c *FlinkJobManagerClient) GetClusterOverview(ctx context.Context, url string) (*ClusterOverviewResponse, error) { + url = url + getOverviewURL + response, err := c.executeRequest(httpGet, url, nil) + if err != nil { + c.metrics.getClusterFailureCounter.Inc(ctx) + return nil, errors.Wrap(err, "GetClusterOverview API request failed") + } + if response != nil && !response.IsSuccess() { + c.metrics.getClusterFailureCounter.Inc(ctx) + if 
response.StatusCode() != int(http.StatusNotFound) && response.StatusCode() != int(http.StatusServiceUnavailable) {
+ logger.Errorf(ctx, fmt.Sprintf("Get cluster overview failed with response %v", response))
+ }
+ return nil, errors.New(fmt.Sprintf("Get cluster overview failed with status %v", response.Status()))
+ }
+ var clusterOverviewResponse ClusterOverviewResponse
+ if err = json.Unmarshal(response.Body(), &clusterOverviewResponse); err != nil {
+ logger.Errorf(ctx, "Unable to Unmarshal clusterOverviewResponse %v, err: %v", response, err)
+ return nil, err
+ }
+ c.metrics.getClusterSuccessCounter.Inc(ctx)
+ return &clusterOverviewResponse, nil
+}
+
+// Helper method to execute the requests
+func (c *FlinkJobManagerClient) executeRequest(
+ method string, url string, payload interface{}) (*resty.Response, error) {
+ var resp *resty.Response
+ var err error
+ if method == httpGet {
+ resp, err = c.client.R().Get(url)
+ } else if method == httpPatch {
+ resp, err = c.client.R().Patch(url)
+ } else if method == httpPost {
+ resp, err = c.client.R().
+ SetHeader("Content-Type", "application/json").
+ SetBody(payload).
+ Post(url)
+ } else {
+ return nil, errors.New(fmt.Sprintf("Invalid method %s in request", method))
+ }
+ return resp, err
+}
+
+func (c *FlinkJobManagerClient) CancelJobWithSavepoint(ctx context.Context, url string, jobID string) (string, error) {
+ path := fmt.Sprintf(savepointURL, jobID)
+
+ url = url + path
+ cancelJobRequest := CancelJobRequest{
+ CancelJob: true,
+ }
+ response, err := c.executeRequest(httpPost, url, cancelJobRequest)
+ if err != nil {
+ c.metrics.cancelJobFailureCounter.Inc(ctx)
+ return "", errors.Wrap(err, "Cancel job API request failed")
+ }
+ if response != nil && !response.IsSuccess() {
+ c.metrics.cancelJobFailureCounter.Inc(ctx)
+ logger.Errorf(ctx, fmt.Sprintf("Cancel job failed with response %v", response))
+ return "", errors.New(fmt.Sprintf("Cancel job failed with status %v", response.Status()))
+ }
+ var cancelJobResponse CancelJobResponse
+ if err = json.Unmarshal(response.Body(), &cancelJobResponse); err != nil {
+ logger.Errorf(ctx, "Unable to Unmarshal cancelJobResponse %v, err: %v", response, err)
+ return "", err
+ }
+ c.metrics.cancelJobSuccessCounter.Inc(ctx)
+ return cancelJobResponse.TriggerID, nil
+}
+
+func (c *FlinkJobManagerClient) ForceCancelJob(ctx context.Context, url string, jobID string) error {
+ path := fmt.Sprintf(jobURL, jobID)
+
+ url = url + path + "?mode=cancel"
+
+ response, err := c.executeRequest(httpPatch, url, nil)
+ if err != nil {
+ c.metrics.forceCancelJobFailureCounter.Inc(ctx)
+ return errors.Wrap(err, "Force cancel job API request failed")
+ }
+ if response != nil && !response.IsSuccess() {
+ c.metrics.forceCancelJobFailureCounter.Inc(ctx)
+ logger.Errorf(ctx, fmt.Sprintf("Force cancel job failed with response %v", response))
+ return errors.New(fmt.Sprintf("Force cancel job failed with status %v", response.Status()))
+ }
+
+ c.metrics.forceCancelJobSuccessCounter.Inc(ctx)
+ return nil
+}
+
+func (c *FlinkJobManagerClient) SubmitJob(ctx context.Context, url string, jarID string, submitJobRequest SubmitJobRequest) (*SubmitJobResponse, error) {
+ path := fmt.Sprintf(submitJobURL, jarID)
+ url = url + path
+
+ response, err := c.executeRequest(httpPost, url, submitJobRequest)
+ if err != nil {
+ c.metrics.submitJobFailureCounter.Inc(ctx)
+ return nil, errors.Wrap(err, "Submit job API request failed")
+ }
+ if response != nil && !response.IsSuccess() {
+ c.metrics.submitJobFailureCounter.Inc(ctx)
logger.Warnf(ctx, fmt.Sprintf("Job submission failed with response %v", response)) + return nil, errors.New(fmt.Sprintf("Job submission failed with status %v\n%s", + response.Status(), string(response.Body()))) + } + var submitJobResponse SubmitJobResponse + if err = json.Unmarshal(response.Body(), &submitJobResponse); err != nil { + logger.Errorf(ctx, "Unable to Unmarshal submitJobResponse %v, err: %v", response, err) + return nil, err + } + + c.metrics.submitJobSuccessCounter.Inc(ctx) + return &submitJobResponse, nil +} + +func (c *FlinkJobManagerClient) CheckSavepointStatus(ctx context.Context, url string, jobID, triggerID string) (*SavepointResponse, error) { + path := fmt.Sprintf(checkSavepointStatusURL, jobID, triggerID) + url = url + path + + response, err := c.executeRequest(httpGet, url, nil) + if err != nil { + c.metrics.checkSavepointFailureCounter.Inc(ctx) + return nil, errors.Wrap(err, "Check savepoint status API request failed") + } + if response != nil && !response.IsSuccess() { + c.metrics.checkSavepointFailureCounter.Inc(ctx) + logger.Errorf(ctx, fmt.Sprintf("Check savepoint status failed with response %v", response)) + return nil, errors.New(fmt.Sprintf("Check savepoint status failed with status %v", response.Status())) + } + var savepointResponse SavepointResponse + if err = json.Unmarshal(response.Body(), &savepointResponse); err != nil { + logger.Errorf(ctx, "Unable to Unmarshal savepointResponse %v, err: %v", response, err) + return nil, err + } + c.metrics.cancelJobSuccessCounter.Inc(ctx) + return &savepointResponse, nil +} + +func (c *FlinkJobManagerClient) GetJobs(ctx context.Context, url string) (*GetJobsResponse, error) { + url = url + getJobsURL + response, err := c.executeRequest(httpGet, url, nil) + if err != nil { + c.metrics.getJobsFailureCounter.Inc(ctx) + return nil, errors.Wrap(err, "Get jobs API request failed") + } + if response != nil && !response.IsSuccess() { + c.metrics.getJobsFailureCounter.Inc(ctx) + logger.Errorf(ctx, fmt.Sprintf("GetJobs failed with response %v", response)) + return nil, errors.New(fmt.Sprintf("GetJobs request failed with status %v", response.Status())) + } + var getJobsResponse GetJobsResponse + if err = json.Unmarshal(response.Body(), &getJobsResponse); err != nil { + logger.Errorf(ctx, "%v", getJobsResponse) + logger.Errorf(ctx, "Unable to Unmarshal getJobsResponse %v, err: %v", response, err) + return nil, err + } + c.metrics.getJobsSuccessCounter.Inc(ctx) + return &getJobsResponse, nil +} + +func (c *FlinkJobManagerClient) GetLatestCheckpoint(ctx context.Context, url string, jobID string) (*CheckpointStatistics, error) { + endpoint := fmt.Sprintf(url+checkpointsURL, jobID) + response, err := c.executeRequest(httpGet, endpoint, nil) + if err != nil { + c.metrics.getCheckpointsFailureCounter.Inc(ctx) + return nil, errors.Wrap(err, "get checkpoints failed") + } + if response != nil && !response.IsSuccess() { + c.metrics.getCheckpointsFailureCounter.Inc(ctx) + return nil, errors.New(fmt.Sprintf("get checkpoints failed with response %v", response)) + } + + var checkpointResponse CheckpointResponse + if err = json.Unmarshal(response.Body(), &checkpointResponse); err != nil { + logger.Errorf(ctx, "Failed to unmarshal checkpointResponse %v, err %v", response, err) + } + + c.metrics.getCheckpointsSuccessCounter.Inc(ctx) + return checkpointResponse.Latest.Completed, nil +} + +func (c *FlinkJobManagerClient) GetTaskManagers(ctx context.Context, url string) (*TaskManagersResponse, error) { + endpoint := url + taskmanagersURL + 
response, err := c.executeRequest(httpGet, endpoint, nil) + if err != nil { + return nil, errors.Wrap(err, "get taskmanagers failed") + } + + if response != nil && !response.IsSuccess() { + return nil, errors.New(fmt.Sprintf("get taskmanagers failed with response %v", response)) + } + + var taskmanagerResponse TaskManagersResponse + if err = json.Unmarshal(response.Body(), &taskmanagerResponse); err != nil { + logger.Errorf(ctx, "Failed to unmarshal taskmanagerResponse %v, err %v", response, err) + } + + return &taskmanagerResponse, nil + +} + +func (c *FlinkJobManagerClient) GetCheckpointCounts(ctx context.Context, url string, jobID string) (*CheckpointResponse, error) { + endpoint := fmt.Sprintf(url+checkpointsURL, jobID) + response, err := c.executeRequest(httpGet, endpoint, nil) + if err != nil { + c.metrics.getCheckpointsFailureCounter.Inc(ctx) + return nil, errors.Wrap(err, "get checkpoints failed") + } + if response != nil && !response.IsSuccess() { + c.metrics.getCheckpointsFailureCounter.Inc(ctx) + return nil, errors.New(fmt.Sprintf("get checkpoints failed with response %v", response)) + } + + var checkpointResponse CheckpointResponse + if err = json.Unmarshal(response.Body(), &checkpointResponse); err != nil { + logger.Errorf(ctx, "Failed to unmarshal checkpointResponse %v, err %v", response, err) + } + + c.metrics.getCheckpointsSuccessCounter.Inc(ctx) + return &checkpointResponse, nil +} + +func (c *FlinkJobManagerClient) GetJobOverview(ctx context.Context, url string, jobID string) (*FlinkJobOverview, error) { + endpoint := fmt.Sprintf(url+getJobsOverviewURL, jobID) + response, err := c.executeRequest(httpGet, endpoint, nil) + if err != nil { + return nil, errors.Wrap(err, "get job overview failed") + } + if response != nil && !response.IsSuccess() { + c.metrics.getCheckpointsFailureCounter.Inc(ctx) + return nil, errors.New(fmt.Sprintf("get job overview failed with response %v", response)) + } + + var jobOverviewResponse FlinkJobOverview + if err = json.Unmarshal(response.Body(), &jobOverviewResponse); err != nil { + logger.Errorf(ctx, "Failed to unmarshal FlinkJob %v, err %v", response, err) + } + + return &jobOverviewResponse, nil +} + +func NewFlinkJobManagerClient(config config.RuntimeConfig) FlinkAPIInterface { + client := resty.SetRetryCount(retryCount).SetTimeout(timeOut) + metrics := newFlinkJobManagerClientMetrics(config.MetricsScope) + return &FlinkJobManagerClient{ + client: client, + metrics: metrics, + } +} diff --git a/pkg/controller/flink/client/api_test.go b/pkg/controller/flink/client/api_test.go new file mode 100644 index 00000000..5f34e1b6 --- /dev/null +++ b/pkg/controller/flink/client/api_test.go @@ -0,0 +1,478 @@ +package client + +import ( + "context" + "testing" + + "github.com/go-resty/resty" + "github.com/jarcoal/httpmock" + mockScope "github.com/lyft/flytestdlib/promutils" + "github.com/lyft/flytestdlib/promutils/labeled" + "github.com/stretchr/testify/assert" + + "strings" + + "github.com/lyft/flinkk8soperator/pkg/controller/common" + "github.com/lyft/flinkk8soperator/pkg/controller/config" +) + +const testURL = "http://abc.com" +const invalidTestResponse = "invalid response" +const fakeJobsURL = "http://abc.com/jobs" +const fakeOverviewURL = "http://abc.com/overview" +const fakeJobConfigURL = "http://abc.com/jobs/1/config" +const fakeSavepointURL = "http://abc.com/jobs/1/savepoints/2" +const fakeSubmitURL = "http://abc.com/jars/1/run" +const fakeCancelURL = "http://abc.com/jobs/1/savepoints" +const fakeTaskmanagersURL = "http://abc.com/taskmanagers" 
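+
+// The fake URLs above are testURL joined with the endpoint paths declared in api.go (getJobsURL,
+// getOverviewURL, savepointURL, and so on); the tests below register httpmock responders against them.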
+ +func getTestClient() FlinkJobManagerClient { + client := resty.SetRetryCount(1) + return FlinkJobManagerClient{ + client: client, + } +} + +func getTestJobManagerClient() FlinkAPIInterface { + testScope := mockScope.NewTestScope() + labeled.SetMetricKeys(common.GetValidLabelNames()...) + return NewFlinkJobManagerClient(config.RuntimeConfig{ + MetricsScope: testScope, + }) +} + +func TestGetJobsHappyCase(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + response := GetJobsResponse{ + Jobs: []FlinkJob{ + { + JobID: "j1", + }, + }, + } + responder, _ := httpmock.NewJsonResponder(200, response) + httpmock.RegisterResponder("GET", fakeJobsURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetJobs(ctx, testURL) + assert.Equal(t, response, *resp) + assert.NoError(t, err) +} + +func TestGetJobsInvalidResponse(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(200, invalidTestResponse) + httpmock.RegisterResponder("GET", fakeJobsURL, responder) + + client := getTestJobManagerClient() + _, err := client.GetJobs(ctx, testURL) + assert.NotNil(t, err) +} + +func TestGetJobs500Response(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(500, nil) + httpmock.RegisterResponder("GET", fakeJobsURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetJobs(ctx, testURL) + assert.Nil(t, resp) + assert.EqualError(t, err, "GetJobs request failed with status 500") +} + +func TestGetJobsError(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + httpmock.RegisterResponder("GET", fakeJobsURL, nil) + + client := getTestJobManagerClient() + resp, err := client.GetJobs(ctx, testURL) + assert.Nil(t, resp) + assert.NotNil(t, err) + assert.True(t, strings.HasPrefix(err.Error(), "Get jobs API request failed")) +} + +func TestGetJobsFlinkJobUnmarshal(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + mockJobsResponse := `{"jobs":[{"id":"abc","status":"RUNNING"}]}` + responder := httpmock.NewStringResponder(200, mockJobsResponse) + httpmock.RegisterResponder("GET", fakeJobsURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetJobs(ctx, testURL) + assert.NotNil(t, resp) + assert.Nil(t, err) + assert.Equal(t, resp.Jobs[0].Status, JobState("RUNNING")) + assert.Equal(t, resp.Jobs[0].JobID, "abc") +} + +func TestGetClusterOverviewHappyCase(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + response := ClusterOverviewResponse{ + TaskManagerCount: 4, + } + responder, _ := httpmock.NewJsonResponder(200, response) + httpmock.RegisterResponder("GET", fakeOverviewURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetClusterOverview(ctx, testURL) + assert.Equal(t, response, *resp) + assert.NoError(t, err) +} + +func TestGetClusterOverviewInvalidResponse(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(200, invalidTestResponse) + httpmock.RegisterResponder("GET", fakeOverviewURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetClusterOverview(ctx, testURL) + assert.Nil(t, resp) + 
assert.NotNil(t, err) +} + +func TestGetCluster500Response(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(500, nil) + httpmock.RegisterResponder("GET", fakeOverviewURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetClusterOverview(ctx, testURL) + assert.Nil(t, resp) + assert.EqualError(t, err, "Get cluster overview failed with status 500") +} + +func TestGetCluster503Response(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(503, nil) + httpmock.RegisterResponder("GET", fakeOverviewURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetClusterOverview(ctx, testURL) + assert.Nil(t, resp) + assert.EqualError(t, err, "Get cluster overview failed with status 503") +} + +func TestGetClusterOverviewError(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + httpmock.RegisterResponder("GET", fakeOverviewURL, nil) + + client := getTestJobManagerClient() + resp, err := client.GetClusterOverview(ctx, testURL) + assert.Nil(t, resp) + assert.NotNil(t, err) + assert.True(t, strings.HasPrefix(err.Error(), "GetClusterOverview API request failed")) +} + +func TestGetJobConfigHappyCase(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + response := JobConfigResponse{ + JobID: "j1", + } + responder, _ := httpmock.NewJsonResponder(200, response) + httpmock.RegisterResponder("GET", fakeJobConfigURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetJobConfig(ctx, testURL, "1") + assert.Equal(t, response, *resp) + assert.NoError(t, err) +} + +func TestGetJobConfigInvalidResponse(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(200, invalidTestResponse) + httpmock.RegisterResponder("GET", fakeJobConfigURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetJobConfig(ctx, testURL, "1") + assert.Nil(t, resp) + assert.NotNil(t, err) +} + +func TestGetJobConfig500Response(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(500, nil) + httpmock.RegisterResponder("GET", fakeJobConfigURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetJobConfig(ctx, testURL, "1") + assert.Nil(t, resp) + assert.EqualError(t, err, "Get Jobconfig failed with status 500") +} + +func TestGetJobConfigError(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + httpmock.RegisterResponder("GET", fakeJobConfigURL, nil) + + client := getTestJobManagerClient() + resp, err := client.GetJobConfig(ctx, testURL, "1") + assert.Nil(t, resp) + assert.NotNil(t, err) + assert.True(t, strings.HasPrefix(err.Error(), "GetJobConfig API request failed")) +} + +func TestCheckSavepointHappyCase(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + response := SavepointResponse{ + SavepointStatus: SavepointStatusResponse{ + Status: SavePointInProgress, + }, + } + responder, _ := httpmock.NewJsonResponder(200, response) + httpmock.RegisterResponder("GET", fakeSavepointURL, responder) + + client := getTestJobManagerClient() 
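+ // jobID "1" and triggerID "2" resolve to jobs/1/savepoints/2, the path
+ // registered above as fakeSavepointURL.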
+ resp, err := client.CheckSavepointStatus(ctx, testURL, "1", "2") + assert.Equal(t, response, *resp) + assert.NoError(t, err) +} + +func TestCheckSavepointInvalidResponse(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(200, invalidTestResponse) + httpmock.RegisterResponder("GET", fakeSavepointURL, responder) + + client := getTestJobManagerClient() + resp, err := client.CheckSavepointStatus(ctx, testURL, "1", "2") + assert.Nil(t, resp) + assert.NotNil(t, err) +} + +func TestCheckSavepoint500Response(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(500, nil) + httpmock.RegisterResponder("GET", fakeSavepointURL, responder) + + client := getTestJobManagerClient() + resp, err := client.CheckSavepointStatus(ctx, testURL, "1", "2") + assert.Nil(t, resp) + assert.EqualError(t, err, "Check savepoint status failed with status 500") +} + +func TestCheckSavepointError(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + httpmock.RegisterResponder("GET", fakeSavepointURL, nil) + + client := getTestJobManagerClient() + resp, err := client.CheckSavepointStatus(ctx, testURL, "1", "2") + assert.Nil(t, resp) + assert.NotNil(t, err) + assert.True(t, strings.HasPrefix(err.Error(), "Check savepoint status API request failed")) +} + +func TestSubmitJobHappyCase(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + response := SubmitJobResponse{ + JobID: "1", + } + responder, _ := httpmock.NewJsonResponder(200, response) + httpmock.RegisterResponder("POST", fakeSubmitURL, responder) + + client := getTestJobManagerClient() + resp, err := client.SubmitJob(ctx, testURL, "1", SubmitJobRequest{ + Parallelism: 10, + }) + assert.Equal(t, response, *resp) + assert.NoError(t, err) +} + +func TestSubmitJobInvalidResponse(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(200, invalidTestResponse) + httpmock.RegisterResponder("POST", fakeSubmitURL, responder) + + client := getTestJobManagerClient() + resp, err := client.SubmitJob(ctx, testURL, "1", SubmitJobRequest{ + Parallelism: 10, + }) + assert.Nil(t, resp) + assert.NotNil(t, err) +} + +func TestSubmitJob500Response(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder := httpmock.NewStringResponder(500, "could not submit") + httpmock.RegisterResponder("POST", fakeSubmitURL, responder) + + client := getTestJobManagerClient() + resp, err := client.SubmitJob(ctx, testURL, "1", SubmitJobRequest{ + Parallelism: 10, + }) + assert.Nil(t, resp) + assert.EqualError(t, err, "Job submission failed with status 500\ncould not submit") +} + +func TestSubmitJobError(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + httpmock.RegisterResponder("POST", fakeSubmitURL, nil) + + client := getTestJobManagerClient() + resp, err := client.SubmitJob(ctx, testURL, "1", SubmitJobRequest{ + Parallelism: 10, + }) + assert.Nil(t, resp) + assert.NotNil(t, err) + assert.True(t, strings.HasPrefix(err.Error(), "Submit job API request failed")) +} + +func TestCancelJobHappyCase(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := 
context.Background() + response := CancelJobResponse{ + TriggerID: "133", + } + responder, _ := httpmock.NewJsonResponder(203, response) + httpmock.RegisterResponder("POST", fakeCancelURL, responder) + + client := getTestJobManagerClient() + resp, err := client.CancelJobWithSavepoint(ctx, testURL, "1") + assert.Equal(t, response.TriggerID, resp) + assert.NoError(t, err) +} + +func TestCancelJobInvalidResponse(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(200, invalidTestResponse) + httpmock.RegisterResponder("POST", fakeCancelURL, responder) + + client := getTestJobManagerClient() + resp, err := client.CancelJobWithSavepoint(ctx, testURL, "1") + assert.Empty(t, resp) + assert.NotNil(t, err) +} + +func TestCancelJob500Response(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(500, nil) + httpmock.RegisterResponder("POST", fakeCancelURL, responder) + + client := getTestJobManagerClient() + resp, err := client.CancelJobWithSavepoint(ctx, testURL, "1") + assert.Empty(t, resp) + assert.EqualError(t, err, "Cancel job failed with status 500") +} + +func TestCancelJobError(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + httpmock.RegisterResponder("POST", fakeCancelURL, nil) + + client := getTestJobManagerClient() + resp, err := client.CancelJobWithSavepoint(ctx, testURL, "1") + assert.Empty(t, resp) + assert.NotNil(t, err) + assert.True(t, strings.HasPrefix(err.Error(), "Cancel job API request failed")) +} + +func TestHttpGetNon200Response(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + response := GetJobsResponse{ + Jobs: []FlinkJob{ + { + JobID: "j1", + }, + }, + } + responder, _ := httpmock.NewJsonResponder(500, response) + httpmock.RegisterResponder("GET", fakeJobsURL, responder) + + client := getTestJobManagerClient() + _, err := client.GetJobs(ctx, testURL) + assert.NotNil(t, err) + assert.EqualError(t, err, "GetJobs request failed with status 500") +} + +func TestClientInvalidMethod(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + + client := getTestClient() + _, err := client.executeRequest("random", testURL, nil) + assert.NotNil(t, err) + assert.EqualError(t, err, "Invalid method random in request") +} + +func TestGetTaskManagersValidResponse(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + response := TaskManagersResponse{ + TaskManagers: []TaskManagerStats{ + { + TimeSinceLastHeartbeat: 1555611965910, + SlotsNumber: 3, + FreeSlots: 0, + }, + }, + } + responder, _ := httpmock.NewJsonResponder(200, response) + httpmock.RegisterResponder("GET", fakeTaskmanagersURL, responder) + + client := getTestJobManagerClient() + resp, err := client.GetTaskManagers(ctx, testURL) + assert.Equal(t, response, *resp) + assert.NoError(t, err) +} + +func TestGetTaskManagersInvalidResponse(t *testing.T) { + httpmock.Activate() + defer httpmock.DeactivateAndReset() + ctx := context.Background() + responder, _ := httpmock.NewJsonResponder(200, invalidTestResponse) + httpmock.RegisterResponder("GET", fakeTaskmanagersURL, responder) + + client := getTestJobManagerClient() + _, err := client.GetJobs(ctx, testURL) + assert.NotNil(t, err) +} diff --git a/pkg/controller/flink/client/entities.go 
b/pkg/controller/flink/client/entities.go new file mode 100644 index 00000000..193b25c6 --- /dev/null +++ b/pkg/controller/flink/client/entities.go @@ -0,0 +1,144 @@ +package client + +type SavepointStatus string + +const ( + SavePointInvalid SavepointStatus = "" + SavePointInProgress SavepointStatus = "IN_PROGRESS" + SavePointCompleted SavepointStatus = "COMPLETED" +) + +type CheckpointStatus string + +const ( + CheckpointInProgress CheckpointStatus = "IN_PROGRESS" + CheckpointFailed CheckpointStatus = "FAILED" + CheckpointCompleted CheckpointStatus = "COMPLETED" +) + +type JobState string + +const ( + Created JobState = "CREATED" + Running JobState = "RUNNING" + Failing JobState = "FAILING" + Failed JobState = "FAILED" + Cancelling JobState = "CANCELLING" + Canceled JobState = "CANCELED" + Finished JobState = "FINISHED" + Restarting JobState = "RESTARTING" + Suspended JobState = "SUSPENDED" + Reconciling JobState = "RECONCILING" +) + +type CancelJobRequest struct { + CancelJob bool `json:"cancel-job"` + TargetDirectory string `json:"target-directory,omitempty"` +} + +type SubmitJobRequest struct { + SavepointPath string `json:"savepointPath"` + Parallelism int32 `json:"parallelism"` + ProgramArgs string `json:"programArgs"` + EntryClass string `json:"entryClass"` +} + +type SavepointResponse struct { + SavepointStatus SavepointStatusResponse `json:"status"` + Operation SavepointOperationResponse `json:"operation"` +} + +type SavepointStatusResponse struct { + Status SavepointStatus `json:"id"` +} + +type SavepointOperationResponse struct { + Location string `json:"location"` + FailureCause FailureCause `json:"failure-cause"` +} + +type FailureCause struct { + Class string `json:"class"` + StackTrace string `json:"stack-trace"` +} + +type CancelJobResponse struct { + TriggerID string `json:"request-id"` +} + +type SubmitJobResponse struct { + JobID string `json:"jobid"` +} + +type GetJobsResponse struct { + Jobs []FlinkJob `json:"jobs"` +} + +type JobConfigResponse struct { + JobID string `json:"jid"` + ExecutionConfig JobExecutionConfig `json:"execution-config"` +} + +type JobExecutionConfig struct { + Parallelism int32 `json:"job-parallelism"` +} + +type FlinkJob struct { + JobID string `json:"id"` + Status JobState `json:"status"` +} + +type FlinkJobOverview struct { + JobID string `json:"jid"` + State JobState `json:"state"` + StartTime int64 `json:"start-time"` + EndTime int64 `json:"end-time"` +} + +type ClusterOverviewResponse struct { + TaskManagerCount int32 `json:"taskmanagers"` + SlotsAvailable int32 `json:"slots-available"` + NumberOfTaskSlots int32 `json:"slots-total"` +} + +type CheckpointStatistics struct { + ID uint `json:"id"` + Status CheckpointStatus `json:"status"` + IsSavepoint bool `json:"is_savepoint"` + TriggerTimestamp int64 `json:"trigger_timestamp"` + LatestAckTimestamp int64 `json:"latest_ack_timestamp"` + StateSize int64 `json:"state_size"` + EndToEndDuration int64 `json:"end_to_end_duration"` + AlignmentBuffered int64 `json:"alignment_buffered"` + NumSubtasks int64 `json:"num_subtasks"` + FailureTimestamp int64 `json:"failure_timestamp"` + FailureMessage string `json:"failure_message"` + ExternalPath string `json:"external_path"` + Discarded bool `json:"discarded"` + RestoredTimeStamp int64 `json:"restore_timestamp"` +} + +type LatestCheckpoints struct { + Completed *CheckpointStatistics `json:"completed,omitempty"` + Savepoint *CheckpointStatistics `json:"savepoint,omitempty"` + Failed *CheckpointStatistics `json:"failed,omitempty"` + Restored 
*CheckpointStatistics `json:"restored,omitempty"` +} + +type CheckpointResponse struct { + Counts map[string]int32 `json:"counts"` + Latest LatestCheckpoints `json:"latest"` + History []CheckpointStatistics `json:"history"` +} + +type TaskManagerStats struct { + Path string `json:"path"` + DataPort int32 `json:"dataPort"` + TimeSinceLastHeartbeat int64 `json:"timeSinceLastHeartbeat"` + SlotsNumber int32 `json:"slotsNumber"` + FreeSlots int32 `json:"freeSlots"` +} + +type TaskManagersResponse struct { + TaskManagers []TaskManagerStats `json:"taskmanagers"` +} diff --git a/pkg/controller/flink/client/mock/mock_api.go b/pkg/controller/flink/client/mock/mock_api.go new file mode 100644 index 00000000..67b7b773 --- /dev/null +++ b/pkg/controller/flink/client/mock/mock_api.go @@ -0,0 +1,110 @@ +package mock + +import ( + "context" + + "github.com/lyft/flinkk8soperator/pkg/controller/flink/client" +) + +type CancelJobWithSavepointFunc func(ctx context.Context, url string, jobID string) (string, error) +type ForceCancelJobFunc func(ctx context.Context, url string, jobID string) error +type SubmitJobFunc func(ctx context.Context, url string, jarID string, submitJobRequest client.SubmitJobRequest) (*client.SubmitJobResponse, error) +type CheckSavepointStatusFunc func(ctx context.Context, url string, jobID, triggerID string) (*client.SavepointResponse, error) +type GetJobsFunc func(ctx context.Context, url string) (*client.GetJobsResponse, error) +type GetClusterOverviewFunc func(ctx context.Context, url string) (*client.ClusterOverviewResponse, error) +type GetLatestCheckpointFunc func(ctx context.Context, url string, jobID string) (*client.CheckpointStatistics, error) +type GetJobConfigFunc func(ctx context.Context, url string, jobID string) (*client.JobConfigResponse, error) +type GetTaskManagersFunc func(ctx context.Context, url string) (*client.TaskManagersResponse, error) +type GetCheckpointCountsFunc func(ctx context.Context, url string, jobID string) (*client.CheckpointResponse, error) +type GetJobOverviewFunc func(ctx context.Context, url string, jobID string) (*client.FlinkJobOverview, error) + +type JobManagerClient struct { + CancelJobWithSavepointFunc CancelJobWithSavepointFunc + ForceCancelJobFunc ForceCancelJobFunc + SubmitJobFunc SubmitJobFunc + CheckSavepointStatusFunc CheckSavepointStatusFunc + GetJobsFunc GetJobsFunc + GetClusterOverviewFunc GetClusterOverviewFunc + GetJobConfigFunc GetJobConfigFunc + GetLatestCheckpointFunc GetLatestCheckpointFunc + GetTaskManagersFunc GetTaskManagersFunc + GetCheckpointCountsFunc GetCheckpointCountsFunc + GetJobOverviewFunc GetJobOverviewFunc +} + +func (m *JobManagerClient) SubmitJob(ctx context.Context, url string, jarID string, submitJobRequest client.SubmitJobRequest) (*client.SubmitJobResponse, error) { + if m.SubmitJobFunc != nil { + return m.SubmitJobFunc(ctx, url, jarID, submitJobRequest) + } + return nil, nil +} + +func (m *JobManagerClient) CancelJobWithSavepoint(ctx context.Context, url string, jobID string) (string, error) { + if m.CancelJobWithSavepointFunc != nil { + return m.CancelJobWithSavepointFunc(ctx, url, jobID) + } + return "", nil +} + +func (m *JobManagerClient) ForceCancelJob(ctx context.Context, url string, jobID string) error { + if m.ForceCancelJobFunc != nil { + return m.ForceCancelJobFunc(ctx, url, jobID) + } + return nil +} + +func (m *JobManagerClient) CheckSavepointStatus(ctx context.Context, url string, jobID, triggerID string) (*client.SavepointResponse, error) { + if m.CheckSavepointStatusFunc != nil { + return 
m.CheckSavepointStatusFunc(ctx, url, jobID, triggerID) + } + return nil, nil +} + +func (m *JobManagerClient) GetJobs(ctx context.Context, url string) (*client.GetJobsResponse, error) { + if m.GetJobsFunc != nil { + return m.GetJobsFunc(ctx, url) + } + return nil, nil +} + +func (m *JobManagerClient) GetClusterOverview(ctx context.Context, url string) (*client.ClusterOverviewResponse, error) { + if m.GetClusterOverviewFunc != nil { + return m.GetClusterOverviewFunc(ctx, url) + } + return nil, nil +} + +func (m *JobManagerClient) GetJobConfig(ctx context.Context, url string, jobID string) (*client.JobConfigResponse, error) { + if m.GetJobConfigFunc != nil { + return m.GetJobConfigFunc(ctx, url, jobID) + } + return nil, nil +} + +func (m *JobManagerClient) GetLatestCheckpoint(ctx context.Context, url string, jobID string) (*client.CheckpointStatistics, error) { + if m.GetLatestCheckpointFunc != nil { + return m.GetLatestCheckpointFunc(ctx, url, jobID) + } + return nil, nil +} + +func (m *JobManagerClient) GetTaskManagers(ctx context.Context, url string) (*client.TaskManagersResponse, error) { + if m.GetTaskManagersFunc != nil { + return m.GetTaskManagersFunc(ctx, url) + } + return nil, nil +} + +func (m *JobManagerClient) GetCheckpointCounts(ctx context.Context, url string, jobID string) (*client.CheckpointResponse, error) { + if m.GetCheckpointCountsFunc != nil { + return m.GetCheckpointCountsFunc(ctx, url, jobID) + } + return nil, nil +} + +func (m *JobManagerClient) GetJobOverview(ctx context.Context, url string, jobID string) (*client.FlinkJobOverview, error) { + if m.GetJobOverviewFunc != nil { + return m.GetJobOverviewFunc(ctx, url, jobID) + } + return nil, nil +} diff --git a/pkg/controller/flink/config.go b/pkg/controller/flink/config.go new file mode 100644 index 00000000..e988d7af --- /dev/null +++ b/pkg/controller/flink/config.go @@ -0,0 +1,120 @@ +package flink + +import ( + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "gopkg.in/yaml.v2" +) + +const ( + JobManagerDefaultReplicaCount = 1 + TaskManagerDefaultSlots = 16 + RPCDefaultPort = 6123 + QueryDefaultPort = 6124 + BlobDefaultPort = 6125 + UIDefaultPort = 8081 + MetricsQueryDefaultPort = 50101 + OffHeapMemoryDefaultFraction = 0.5 +) + +func firstNonNil(x *int32, y int32) int32 { + if x != nil { + return *x + } + return y +} + +func getValidFraction(x *float64, y float64) float64 { + if x != nil && *x >= float64(0) && *x <= float64(1) { + return *x + } + return y +} + +func getTaskmanagerSlots(app *v1alpha1.FlinkApplication) int32 { + return firstNonNil(app.Spec.TaskManagerConfig.TaskSlots, TaskManagerDefaultSlots) +} + +func getJobmanagerReplicas(app *v1alpha1.FlinkApplication) int32 { + return firstNonNil(app.Spec.JobManagerConfig.Replicas, JobManagerDefaultReplicaCount) +} + +func getRPCPort(app *v1alpha1.FlinkApplication) int32 { + return firstNonNil(app.Spec.RPCPort, RPCDefaultPort) +} + +func getUIPort(app *v1alpha1.FlinkApplication) int32 { + return firstNonNil(app.Spec.UIPort, UIDefaultPort) +} + +func getQueryPort(app *v1alpha1.FlinkApplication) int32 { + return firstNonNil(app.Spec.QueryPort, QueryDefaultPort) +} + +func getBlobPort(app *v1alpha1.FlinkApplication) int32 { + return firstNonNil(app.Spec.BlobPort, BlobDefaultPort) +} + +func getInternalMetricsQueryPort(app *v1alpha1.FlinkApplication) int32 { + return firstNonNil(app.Spec.MetricsQueryPort, MetricsQueryDefaultPort) +} + +func getTaskManagerMemory(application *v1alpha1.FlinkApplication) int64 { + tmResources := 
application.Spec.TaskManagerConfig.Resources + if tmResources == nil { + tmResources = &TaskManagerDefaultResources + } + tmMemory, _ := tmResources.Requests.Memory().AsInt64() + return tmMemory +} + +func getJobManagerMemory(application *v1alpha1.FlinkApplication) int64 { + jmResources := application.Spec.JobManagerConfig.Resources + if jmResources == nil { + jmResources = &JobManagerDefaultResources + } + jmMemory, _ := jmResources.Requests.Memory().AsInt64() + return jmMemory +} + +func getTaskManagerHeapMemory(app *v1alpha1.FlinkApplication) float64 { + offHeapMemoryFrac := getValidFraction(app.Spec.TaskManagerConfig.OffHeapMemoryFraction, OffHeapMemoryDefaultFraction) + tmMemory := float64(getTaskManagerMemory(app)) + heapMemoryBytes := tmMemory - (tmMemory * offHeapMemoryFrac) + heapMemoryMB := heapMemoryBytes / (1024 * 1024) + return heapMemoryMB +} + +func getJobManagerHeapMemory(app *v1alpha1.FlinkApplication) float64 { + offHeapMemoryFrac := getValidFraction(app.Spec.JobManagerConfig.OffHeapMemoryFraction, OffHeapMemoryDefaultFraction) + jmMemory := float64(getJobManagerMemory(app)) + heapMemoryBytes := jmMemory - (jmMemory * offHeapMemoryFrac) + heapMemoryMB := heapMemoryBytes / (1024 * 1024) + return heapMemoryMB +} + +// Renders the flink configuration overrides stored in FlinkApplication.FlinkConfig into a +// YAML string suitable for interpolating into flink-conf.yaml. +func renderFlinkConfig(app *v1alpha1.FlinkApplication) (string, error) { + config := app.Spec.FlinkConfig.DeepCopy() + if config == nil { + config = &v1alpha1.FlinkConfig{} + } + + // we will fill this in later using the versioned service + delete(*config, "jobmanager.rpc.address") + + (*config)["taskmanager.numberOfTaskSlots"] = getTaskmanagerSlots(app) + (*config)["jobmanager.rpc.port"] = getRPCPort(app) + (*config)["jobmanager.web.port"] = getUIPort(app) + (*config)["query.server.port"] = getQueryPort(app) + (*config)["blob.server.port"] = getBlobPort(app) + (*config)["metrics.internal.query-service.port"] = getInternalMetricsQueryPort(app) + (*config)["jobmanager.heap.size"] = getJobManagerHeapMemory(app) + (*config)["taskmanager.heap.size"] = getTaskManagerHeapMemory(app) + + b, err := yaml.Marshal(config) + if err != nil { + return "", err + } + return string(b), nil +} diff --git a/pkg/controller/flink/config_test.go b/pkg/controller/flink/config_test.go new file mode 100644 index 00000000..d8e7774f --- /dev/null +++ b/pkg/controller/flink/config_test.go @@ -0,0 +1,191 @@ +package flink + +import ( + "fmt" + "sort" + "strings" + "testing" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/stretchr/testify/assert" + coreV1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestRenderFlinkConfigOverrides(t *testing.T) { + taskSlots := int32(4) + blobPort := int32(1000) + offHeapMemoryFrac := 0.5 + + yaml, err := renderFlinkConfig(&v1alpha1.FlinkApplication{ + ObjectMeta: v1.ObjectMeta{ + Name: "test-app", + }, + Spec: v1alpha1.FlinkApplicationSpec{ + FlinkConfig: map[string]interface{}{ + "akka.timeout": "5s", + "taskmanager.network.memory.fraction": 0.1, + "taskmanager.network.request-backoff.max": 5000, + "jobmanager.rpc.address": "wrong-address", + }, + TaskManagerConfig: v1alpha1.TaskManagerConfig{ + TaskSlots: &taskSlots, + OffHeapMemoryFraction: &offHeapMemoryFrac, + }, + JobManagerConfig: v1alpha1.JobManagerConfig{ + OffHeapMemoryFraction: &offHeapMemoryFrac, + }, + BlobPort: &blobPort, + }, + Status: 
v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationNew, + }, + }) + + if err != nil { + assert.NoError(t, err, "Got error producing config") + } + + lines := strings.Split(strings.TrimSpace(yaml), "\n") + sort.Strings(lines) + + expected := []string{ + "akka.timeout: 5s", + fmt.Sprintf("blob.server.port: %d", blobPort), + "jobmanager.heap.size: 1536", // defaults + fmt.Sprintf("jobmanager.rpc.port: %d", RPCDefaultPort), + fmt.Sprintf("jobmanager.web.port: %d", UIDefaultPort), + fmt.Sprintf("metrics.internal.query-service.port: %d", MetricsQueryDefaultPort), + fmt.Sprintf("query.server.port: %d", QueryDefaultPort), + "taskmanager.heap.size: 512", // defaults + "taskmanager.network.memory.fraction: 0.1", + "taskmanager.network.request-backoff.max: 5000", + fmt.Sprintf("taskmanager.numberOfTaskSlots: %d", taskSlots), + } + + assert.Equal(t, expected, lines) +} + +func TestGetTaskSlots(t *testing.T) { + app1 := v1alpha1.FlinkApplication{} + assert.Equal(t, int32(TaskManagerDefaultSlots), getTaskmanagerSlots(&app1)) + + app2 := v1alpha1.FlinkApplication{} + taskSlots := int32(4) + app2.Spec.TaskManagerConfig.TaskSlots = &taskSlots + assert.Equal(t, int32(4), getTaskmanagerSlots(&app2)) +} + +func TestGetJobManagerReplicas(t *testing.T) { + app1 := v1alpha1.FlinkApplication{} + assert.Equal(t, int32(JobManagerDefaultReplicaCount), getJobmanagerReplicas(&app1)) +} + +func TestGetJobManagerReplicasNonZero(t *testing.T) { + app1 := v1alpha1.FlinkApplication{} + replicas := int32(4) + + app1.Spec.JobManagerConfig.Replicas = &replicas + assert.Equal(t, int32(4), getJobmanagerReplicas(&app1)) +} + +func TestGetTaskManagerMemory(t *testing.T) { + app := v1alpha1.FlinkApplication{} + tmResources := coreV1.ResourceRequirements{ + Requests: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + Limits: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + } + expectedResource := resource.MustParse("1Mi") + expectedValue, _ := expectedResource.AsInt64() + app.Spec.TaskManagerConfig.Resources = &tmResources + assert.Equal(t, expectedValue, getTaskManagerMemory(&app)) +} + +func TestGetJobManagerMemory(t *testing.T) { + app := v1alpha1.FlinkApplication{} + tmResources := coreV1.ResourceRequirements{ + Requests: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + Limits: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + } + expectedResource := resource.MustParse("1Mi") + expectedValue, _ := expectedResource.AsInt64() + app.Spec.JobManagerConfig.Resources = &tmResources + assert.Equal(t, expectedValue, getJobManagerMemory(&app)) +} + +func TestGetTaskManagerHeapMemory(t *testing.T) { + app := v1alpha1.FlinkApplication{} + tmResources := coreV1.ResourceRequirements{ + Requests: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + Limits: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + } + offHeapMemoryFraction := float64(0.5) + app.Spec.TaskManagerConfig.Resources = &tmResources + app.Spec.TaskManagerConfig.OffHeapMemoryFraction = &offHeapMemoryFraction + + tmMemory := float64(getTaskManagerMemory(&app)) + expectedtmHeapMemoryMB := (tmMemory - 
tmMemory*offHeapMemoryFraction) / (1024 * 1024) + assert.Equal(t, expectedtmHeapMemoryMB, getTaskManagerHeapMemory(&app)) +} + +func TestGetJobManagerHeapMemory(t *testing.T) { + app := v1alpha1.FlinkApplication{} + jmResources := coreV1.ResourceRequirements{ + Requests: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + Limits: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + } + offHeapMemoryFraction := float64(0.5) + app.Spec.JobManagerConfig.Resources = &jmResources + app.Spec.JobManagerConfig.OffHeapMemoryFraction = &offHeapMemoryFraction + + jmMemory := float64(getJobManagerMemory(&app)) + expectedjmHeapMemoryMB := (jmMemory - jmMemory*offHeapMemoryFraction) / (1024 * 1024) + assert.Equal(t, expectedjmHeapMemoryMB, getJobManagerHeapMemory(&app)) +} + +func TestInvalidMemoryFraction(t *testing.T) { + app := v1alpha1.FlinkApplication{} + jmResources := coreV1.ResourceRequirements{ + Requests: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + Limits: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1Mi"), + }, + } + offHeapMemoryFraction := float64(1.5) + app.Spec.JobManagerConfig.Resources = &jmResources + app.Spec.JobManagerConfig.OffHeapMemoryFraction = &offHeapMemoryFraction + + jmMemory := float64(getJobManagerMemory(&app)) + expectedjmHeapMemoryMB := (jmMemory - jmMemory*OffHeapMemoryDefaultFraction) / (1024 * 1024) + assert.Equal(t, expectedjmHeapMemoryMB, getJobManagerHeapMemory(&app)) + +} diff --git a/pkg/controller/flink/container_utils.go b/pkg/controller/flink/container_utils.go new file mode 100644 index 00000000..2330fc24 --- /dev/null +++ b/pkg/controller/flink/container_utils.go @@ -0,0 +1,244 @@ +package flink + +import ( + "fmt" + "hash/fnv" + + "github.com/davecgh/go-spew/spew" + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/common" + "github.com/lyft/flinkk8soperator/pkg/controller/config" + "github.com/lyft/flinkk8soperator/pkg/controller/k8" + "github.com/pkg/errors" + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/json" +) + +const ( + AppName = "APP_NAME" + AwsMetadataServiceTimeoutKey = "AWS_METADATA_SERVICE_TIMEOUT" + AwsMetadataServiceNumAttemptsKey = "AWS_METADATA_SERVICE_NUM_ATTEMPTS" + AwsMetadataServiceTimeout = "5" + AwsMetadataServiceNumAttempts = "20" + OperatorFlinkConfig = "OPERATOR_FLINK_CONFIG" + FlinkDeploymentType = "flink-deployment-type" + FlinkDeploymentTypeJobmanager = "jobmanager" + FlinkDeploymentTypeTaskmanager = "taskmanager" + FlinkAppHash = "flink-app-hash" + FlinkJobProperties = "flink-job-properties" + RestartNonce = "restart-nonce" +) + +func getFlinkContainerName(containerName string) string { + cfg := config.GetConfig() + containerNameFormat := cfg.ContainerNameFormat + if containerNameFormat != "" { + return fmt.Sprintf(containerNameFormat, containerName) + } + return containerName +} + +func getCommonAppLabels(app *v1alpha1.FlinkApplication) map[string]string { + return k8.GetAppLabel(app.Name) +} + +func getCommonAnnotations(app *v1alpha1.FlinkApplication) map[string]string { + annotations := common.DuplicateMap(app.Annotations) + 
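+ // Record the job submission parameters as an annotation; DeploymentsEqual
+ // compares this value, so changing the jar name, parallelism, entry class,
+ // or program args makes existing deployments no longer match the application.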
annotations[FlinkJobProperties] = fmt.Sprintf( + "jarName: %s\nparallelism: %d\nentryClass:%s\nprogramArgs:\"%s\"", + app.Spec.JarName, app.Spec.Parallelism, app.Spec.EntryClass, app.Spec.ProgramArgs) + if app.Spec.RestartNonce != "" { + annotations[RestartNonce] = app.Spec.RestartNonce + } + return annotations +} + +func GetAWSServiceEnv() []v1.EnvVar { + return []v1.EnvVar{ + { + Name: AwsMetadataServiceTimeoutKey, + Value: AwsMetadataServiceTimeout, + }, + { + Name: AwsMetadataServiceNumAttemptsKey, + Value: AwsMetadataServiceNumAttempts, + }, + } +} + +func getFlinkEnv(app *v1alpha1.FlinkApplication) ([]v1.EnvVar, error) { + env := []v1.EnvVar{} + appName := app.Name + + flinkConfig, err := renderFlinkConfig(app) + if err != nil { + return nil, errors.Wrap(err, "Failed to serialize flink configuration") + } + + env = append(env, []v1.EnvVar{ + { + Name: AppName, + Value: appName, + }, + { + Name: OperatorFlinkConfig, + Value: flinkConfig, + }, + }...) + return env, nil +} + +func GetFlinkContainerEnv(app *v1alpha1.FlinkApplication) []v1.EnvVar { + env := []v1.EnvVar{} + env = append(env, GetAWSServiceEnv()...) + flinkEnv, err := getFlinkEnv(app) + if err == nil { + env = append(env, flinkEnv...) + } + return env +} + +func ImagePullPolicy(app *v1alpha1.FlinkApplication) v1.PullPolicy { + if app.Spec.ImagePullPolicy == "" { + return v1.PullIfNotPresent + } + return app.Spec.ImagePullPolicy +} + +// Returns an 8 character hash sensitive to the application name, labels, annotations, and spec. +// TODO: we may need to add collision-avoidance to this +func HashForApplication(app *v1alpha1.FlinkApplication) string { + printer := spew.ConfigState{ + Indent: " ", + SortKeys: true, + DisableMethods: true, + DisableCapacities: true, + SpewKeys: true, + DisablePointerAddresses: true, + } + + // we round-trip through json to normalize the deployment objects + jmDeployment := jobmanagerTemplate(app) + jmDeployment.OwnerReferences = make([]metav1.OwnerReference, 0) + + // these steps should not be able to fail, so we panic instead of returning an error + jm, err := json.Marshal(jmDeployment) + if err != nil { + panic("failed to marshal deployment") + } + err = json.Unmarshal(jm, &jmDeployment) + if err != nil { + panic("failed to unmarshal deployment") + } + + tmDeployment := taskmanagerTemplate(app) + tmDeployment.OwnerReferences = make([]metav1.OwnerReference, 0) + tm, err := json.Marshal(tmDeployment) + if err != nil { + panic("failed to marshal deployment") + } + err = json.Unmarshal(tm, &tmDeployment) + if err != nil { + panic("failed to unmarshal deployment") + } + + hasher := fnv.New32a() + _, err = printer.Fprintf(hasher, "%#v%#v", jmDeployment, tmDeployment) + if err != nil { + // the hasher cannot actually throw an error on write + panic(fmt.Sprintf("got error trying when writing to hash %v", err)) + } + + return fmt.Sprintf("%08x", hasher.Sum32()) +} + +func GetAppHashSelector(app *v1alpha1.FlinkApplication) map[string]string { + return GetAppHashSelectorWithHash(HashForApplication(app)) +} + +func GetAppHashSelectorWithHash(hash string) map[string]string { + return map[string]string{ + FlinkAppHash: hash, + } +} + +func InjectHashesIntoConfig(deployment *appsv1.Deployment, app *v1alpha1.FlinkApplication, hash string) { + var newContainers []v1.Container + for _, container := range deployment.Spec.Template.Spec.Containers { + var newEnv []v1.EnvVar + for _, env := range container.Env { + if env.Name == OperatorFlinkConfig { + env.Value = fmt.Sprintf("%s\nhigh-availability.cluster-id: 
%s-%s\n", env.Value, app.Name, hash) + env.Value = fmt.Sprintf("%sjobmanager.rpc.address: %s\n", env.Value, VersionedJobManagerService(app, hash)) + } + newEnv = append(newEnv, env) + } + container.Env = newEnv + newContainers = append(newContainers, container) + } + deployment.Spec.Template.Spec.Containers = newContainers +} + +func envsEqual(a []v1.EnvVar, b []v1.EnvVar) bool { + if len(a) != len(b) { + return false + } + + for i := 0; i < len(a); i++ { + if a[i].Name != b[i].Name || a[i].Value != b[i].Value { + return false + } + } + return true +} + +func containersEqual(a *v1.Container, b *v1.Container) bool { + if !(a.Image == b.Image && + a.ImagePullPolicy == b.ImagePullPolicy && + apiequality.Semantic.DeepEqual(a.Args, b.Args) && + apiequality.Semantic.DeepEqual(a.Resources, b.Resources) && + envsEqual(a.Env, b.Env) && + apiequality.Semantic.DeepEqual(a.EnvFrom, b.EnvFrom) && + apiequality.Semantic.DeepEqual(a.VolumeMounts, b.VolumeMounts)) { + return false + } + + if len(a.Ports) != len(b.Ports) { + return false + } + + for i := 0; i < len(a.Ports); i++ { + if a.Ports[i].Name != b.Ports[i].Name || + a.Ports[i].ContainerPort != b.Ports[i].ContainerPort { + return false + } + } + + return true +} + +// Returns true if there are no relevant differences between the deployments. This should be used only to determine +// that two deployments correspond to the same FlinkApplication, not as a general notion of equality. +func DeploymentsEqual(a *appsv1.Deployment, b *appsv1.Deployment) bool { + if !apiequality.Semantic.DeepEqual(a.Spec.Template.Spec.Volumes, b.Spec.Template.Spec.Volumes) { + return false + } + if len(a.Spec.Template.Spec.Containers) == 0 || + len(b.Spec.Template.Spec.Containers) == 0 || + !containersEqual(&a.Spec.Template.Spec.Containers[0], &b.Spec.Template.Spec.Containers[0]) { + return false + } + if *a.Spec.Replicas != *b.Spec.Replicas { + return false + } + if a.Annotations[FlinkJobProperties] != b.Annotations[FlinkJobProperties] { + return false + } + if a.Annotations[RestartNonce] != b.Annotations[RestartNonce] { + return false + } + return true +} diff --git a/pkg/controller/flink/container_utils_test.go b/pkg/controller/flink/container_utils_test.go new file mode 100644 index 00000000..946efcd3 --- /dev/null +++ b/pkg/controller/flink/container_utils_test.go @@ -0,0 +1,103 @@ +package flink + +import ( + "testing" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" +) + +func TestHashForApplication(t *testing.T) { + app := v1alpha1.FlinkApplication{} + taskSlots := int32(8) + app.Spec.TaskManagerConfig.TaskSlots = &taskSlots + app.Spec.Parallelism = 4 + app.Name = "app-name" + app.Namespace = "ns" + app.Spec.Image = "abcdef" + app.ObjectMeta.Labels = map[string]string{ + "label-k": "label-v", + } + app.ObjectMeta.Annotations = map[string]string{ + "annotation-k": "annotation-v", + } + + h1 := HashForApplication(&app) + assert.Equal(t, 8, len(h1)) + + app.Name = "another-name" + h2 := HashForApplication(&app) + assert.NotEqual(t, h1, h2) + + app.Spec.Image = "zxy" + h3 := HashForApplication(&app) + assert.NotEqual(t, h2, h3) + + app.Labels["label-k"] = "new-v" + h4 := HashForApplication(&app) + assert.NotEqual(t, h3, h4) + + app.Annotations["annotation-k"] = "new-v" + h5 := HashForApplication(&app) + assert.NotEqual(t, h4, h5) + + app.Spec.Parallelism = 7 + h6 := HashForApplication(&app) + assert.NotEqual(t, h5, h6) +} + +func 
TestHashForDifferentResourceScales(t *testing.T) { + app1 := v1alpha1.FlinkApplication{} + app1.Spec.TaskManagerConfig.Resources = &v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("0.5"), + v1.ResourceMemory: resource.MustParse("1024Mi"), + }, + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("0.5"), + v1.ResourceMemory: resource.MustParse("1024Mi"), + }, + } + + app2 := v1alpha1.FlinkApplication{} + app2.Spec.TaskManagerConfig.Resources = &v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("500m"), + v1.ResourceMemory: resource.MustParse("1024Mi"), + }, + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("500m"), + v1.ResourceMemory: resource.MustParse("1024Mi"), + }, + } + + assert.Equal(t, HashForApplication(&app1), HashForApplication(&app2)) +} + +func TestContainersEqual(t *testing.T) { + app := getFlinkTestApp() + d1 := FetchJobMangerDeploymentCreateObj(&app, "hash") + d2 := FetchJobMangerDeploymentCreateObj(&app, "hash") + + assert.True(t, DeploymentsEqual(d1, d2)) + + d1 = FetchTaskMangerDeploymentCreateObj(&app, HashForApplication(&app)) + d2 = FetchTaskMangerDeploymentCreateObj(&app, HashForApplication(&app)) + + assert.True(t, DeploymentsEqual(d1, d2)) + + d3 := d1.DeepCopy() + d3.Spec.Template.Spec.Containers[0].ImagePullPolicy = "Always" + assert.False(t, DeploymentsEqual(d3, d2)) + + d3 = d1.DeepCopy() + replicas := int32(13) + d3.Spec.Replicas = &replicas + assert.False(t, DeploymentsEqual(d3, d2)) + + d3 = d1.DeepCopy() + d3.Annotations[RestartNonce] = "x" + assert.False(t, DeploymentsEqual(d3, d2)) +} diff --git a/pkg/controller/flink/flink.go b/pkg/controller/flink/flink.go new file mode 100644 index 00000000..55c906c1 --- /dev/null +++ b/pkg/controller/flink/flink.go @@ -0,0 +1,553 @@ +package flink + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "time" + + "github.com/lyft/flinkk8soperator/pkg/controller/common" + + "github.com/lyft/flinkk8soperator/pkg/controller/config" + "github.com/lyft/flytestdlib/logger" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/flink/client" + "github.com/lyft/flinkk8soperator/pkg/controller/k8" + "github.com/lyft/flytestdlib/promutils" + "github.com/lyft/flytestdlib/promutils/labeled" + v1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const proxyURL = "http://localhost:%d/api/v1/namespaces/%s/services/%s:8081/proxy" +const port = 8081 + +// Maximum age of an externalized checkpoint that we will attempt to restore +const maxRestoreCheckpointAge = 24 * time.Hour + +// If the last hearbeat from a taskmanager was more than taskManagerHeartbeatThreshold, the task +// manager is considered unhealthy. 
+const taskManagerHeartbeatThreshold = 2 * time.Minute + +// Maximum allowable number of checkpoint failures before job health status is Red +const maxCheckpointTime = 10 * time.Minute + +// If the job has been in and out of a FAILING state within failingIntervalThreshold, we consider +// the JobStatus.Health to be "Red" +const failingIntervalThreshold = 1 * time.Minute + +// Interface to manage Flink Application in Kubernetes +type ControllerInterface interface { + // Creates a Flink cluster with necessary Job Manager, Task Managers and services for UI + CreateCluster(ctx context.Context, application *v1alpha1.FlinkApplication) error + + // Deletes a Flink cluster based on the hash + DeleteCluster(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error + + // Cancels the running/active jobs in the Cluster for the Application after savepoint is created + CancelWithSavepoint(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) + + // Force cancels the running/active job without taking a savepoint + ForceCancel(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error + + // Starts the Job in the Flink Cluster + StartFlinkJob(ctx context.Context, application *v1alpha1.FlinkApplication, hash string, + jarName string, parallelism int32, entryClass string, programArgs string) (string, error) + + // Savepoint creation is asynchronous. + // Polls the status of the Savepoint, using the triggerID + GetSavepointStatus(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (*client.SavepointResponse, error) + + // Check if the Flink Kubernetes Cluster is Ready. + // Checks if all the pods of task and job managers are ready. + IsClusterReady(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) + + // Checks to see if the Flink Cluster is ready to handle API requests + IsServiceReady(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) + + // Returns the list of Jobs running on the Flink Cluster for the Application + GetJobsForApplication(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) ([]client.FlinkJob, error) + + // For the application, a deployment corresponds to an image. This returns the current and older deployments for the app. + GetCurrentAndOldDeploymentsForApp(ctx context.Context, application *v1alpha1.FlinkApplication) (*common.FlinkDeployment, []common.FlinkDeployment, error) + + // Attempts to find an externalized checkpoint for the job. This can be used to recover an application that is not + // able to savepoint for some reason. 
+ FindExternalizedCheckpoint(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) + + // Logs an event to the FlinkApplication resource and to the operator log + LogEvent(ctx context.Context, app *v1alpha1.FlinkApplication, fieldPath string, eventType string, message string) + + // Compares and updates new cluster status with current cluster status + // Returns true if there is a change in ClusterStatus + CompareAndUpdateClusterStatus(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) + + // Compares and updates new job status with current job status + // Returns true if there is a change in JobStatus + CompareAndUpdateJobStatus(ctx context.Context, app *v1alpha1.FlinkApplication, hash string) (bool, error) +} + +func NewController(k8sCluster k8.ClusterInterface, config config.RuntimeConfig) ControllerInterface { + metrics := newControllerMetrics(config.MetricsScope) + return &Controller{ + k8Cluster: k8sCluster, + jobManager: NewJobManagerController(k8sCluster, config), + taskManager: NewTaskManagerController(k8sCluster, config), + flinkClient: client.NewFlinkJobManagerClient(config), + metrics: metrics, + } +} + +func newControllerMetrics(scope promutils.Scope) *controllerMetrics { + flinkControllerScope := scope.NewSubScope("flink_controller") + return &controllerMetrics{ + scope: scope, + deleteClusterSuccessCounter: labeled.NewCounter("delete_cluster_success", "Flink cluster deleted successfully", flinkControllerScope), + deleteClusterFailedCounter: labeled.NewCounter("delete_cluster_failure", "Flink cluster deletion failed", flinkControllerScope), + applicationChangedCounter: labeled.NewCounter("app_changed_counter", "Flink application has changed", flinkControllerScope), + } +} + +type controllerMetrics struct { + scope promutils.Scope + deleteClusterSuccessCounter labeled.Counter + deleteClusterFailedCounter labeled.Counter + applicationChangedCounter labeled.Counter +} + +type Controller struct { + k8Cluster k8.ClusterInterface + jobManager JobManagerControllerInterface + taskManager TaskManagerControllerInterface + flinkClient client.FlinkAPIInterface + metrics *controllerMetrics +} + +func getURLFromApp(application *v1alpha1.FlinkApplication, hash string) string { + service := VersionedJobManagerService(application, hash) + cfg := config.GetConfig() + if cfg.UseProxy { + return fmt.Sprintf(proxyURL, cfg.ProxyPort.Port, application.Namespace, service) + } + return fmt.Sprintf("http://%s.%s:%d", service, application.Namespace, port) +} + +func GetActiveFlinkJob(jobs []client.FlinkJob) *client.FlinkJob { + if len(jobs) == 0 { + return nil + } + for _, job := range jobs { + if job.Status == client.Running || + job.Status == client.Created || + job.Status == client.Finished { + return &job + } + } + return nil +} + +// returns true iff the deployment exactly matches the flink application +func (f *Controller) deploymentMatches(ctx context.Context, deployment *v1.Deployment, application *v1alpha1.FlinkApplication) bool { + if DeploymentIsTaskmanager(deployment) { + return TaskManagerDeploymentMatches(deployment, application) + } + if DeploymentIsJobmanager(deployment) { + return JobManagerDeploymentMatches(deployment, application) + } + + logger.Warnf(ctx, "Found deployment that is not a TaskManager or JobManager: %s", deployment.Name) + return false +} + +func (f *Controller) GetJobsForApplication(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) ([]client.FlinkJob, error) { + 
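+ // The job list is fetched via the hash-versioned JobManager service for this
+ // deploy (see getURLFromApp), so the query targets the cluster identified by
+ // the supplied hash.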
jobResponse, err := f.flinkClient.GetJobs(ctx, getURLFromApp(application, hash)) + if err != nil { + return nil, err + } + + return jobResponse.Jobs, nil +} + +// The operator for now assumes and is intended to run single application per Flink Cluster. +// Once we move to run multiple applications, this has to be removed/updated +func (f *Controller) getJobIDForApplication(application *v1alpha1.FlinkApplication) (string, error) { + if application.Status.JobStatus.JobID != "" { + return application.Status.JobStatus.JobID, nil + } + + return "", errors.New("active job id not available") +} + +func (f *Controller) CancelWithSavepoint(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) { + jobID, err := f.getJobIDForApplication(application) + if err != nil { + return "", err + } + return f.flinkClient.CancelJobWithSavepoint(ctx, getURLFromApp(application, hash), jobID) +} + +func (f *Controller) ForceCancel(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error { + jobID, err := f.getJobIDForApplication(application) + if err != nil { + return err + } + return f.flinkClient.ForceCancelJob(ctx, getURLFromApp(application, hash), jobID) +} + +func (f *Controller) CreateCluster(ctx context.Context, application *v1alpha1.FlinkApplication) error { + newlyCreatedJm, err := f.jobManager.CreateIfNotExist(ctx, application) + if err != nil { + logger.Errorf(ctx, "Job manager cluster creation did not succeed %v", err) + f.LogEvent(ctx, application, "", corev1.EventTypeWarning, + fmt.Sprintf("Failed to create job managers: %v", err)) + + return err + } + newlyCreatedTm, err := f.taskManager.CreateIfNotExist(ctx, application) + if err != nil { + logger.Errorf(ctx, "Task manager cluster creation did not succeed %v", err) + f.LogEvent(ctx, application, "", corev1.EventTypeWarning, + fmt.Sprintf("Failed to create task managers: %v", err)) + return err + } + + if newlyCreatedJm || newlyCreatedTm { + f.LogEvent(ctx, application, "", corev1.EventTypeNormal, "Flink cluster created") + } + return nil +} + +func (f *Controller) StartFlinkJob(ctx context.Context, application *v1alpha1.FlinkApplication, hash string, + jarName string, parallelism int32, entryClass string, programArgs string) (string, error) { + response, err := f.flinkClient.SubmitJob( + ctx, + getURLFromApp(application, hash), + jarName, + client.SubmitJobRequest{ + Parallelism: parallelism, + SavepointPath: application.Spec.SavepointInfo.SavepointLocation, + EntryClass: entryClass, + ProgramArgs: programArgs, + }) + if err != nil { + return "", err + } + if response.JobID == "" { + logger.Errorf(ctx, "Job id in the submit job response was empty") + return "", errors.New("unable to submit job: invalid job id") + } + return response.JobID, nil +} + +func (f *Controller) GetSavepointStatus(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (*client.SavepointResponse, error) { + jobID, err := f.getJobIDForApplication(application) + if err != nil { + return nil, err + } + return f.flinkClient.CheckSavepointStatus(ctx, getURLFromApp(application, hash), jobID, application.Spec.SavepointInfo.TriggerID) +} + +func (f *Controller) DeleteCluster(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error { + if hash == "" { + return errors.New("invalid hash: must not be empty") + } + + jmDeployment := FetchJobMangerDeploymentDeleteObj(application, hash) + err := f.k8Cluster.DeleteK8Object(ctx, jmDeployment) + if err != nil { + 
f.metrics.deleteClusterFailedCounter.Inc(ctx) + logger.Warnf(ctx, "Failed to delete jobmanager deployment") + return err + } + + tmDeployment := FetchTaskMangerDeploymentDeleteObj(application, hash) + err = f.k8Cluster.DeleteK8Object(ctx, tmDeployment) + if err != nil { + f.metrics.deleteClusterFailedCounter.Inc(ctx) + logger.Warnf(ctx, "Failed to delete taskmanager deployment") + return err + } + + versionedJobService := FetchVersionedJobManagerServiceDeleteObj(application, hash) + err = f.k8Cluster.DeleteK8Object(ctx, versionedJobService) + if err != nil { + f.metrics.deleteClusterFailedCounter.Inc(ctx) + logger.Warnf(ctx, "Failed to delete versioned service") + return err + } + + f.metrics.deleteClusterSuccessCounter.Inc(ctx) + return nil +} + +func (f *Controller) IsClusterReady(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + labelMap := GetAppHashSelector(application) + + deploymentList, err := f.k8Cluster.GetDeploymentsWithLabel(ctx, application.Namespace, labelMap) + if err != nil { + logger.Warnf(ctx, "Failed to get deployments for label map %v", labelMap) + return false, err + } + if deploymentList == nil || len(deploymentList.Items) == 0 { + logger.Infof(ctx, "No deployments present for label map %v", labelMap) + return false, nil + } + + // TODO: Find if any events can be populated, that are useful to users + for _, deployment := range deploymentList.Items { + // For Jobmanager we only need on replica to be available + if deployment.Labels[FlinkDeploymentType] == FlinkDeploymentTypeJobmanager { + if deployment.Status.AvailableReplicas == 0 { + return false, nil + } + } else { + if deployment.Spec.Replicas != nil && + deployment.Status.AvailableReplicas < *deployment.Spec.Replicas { + return false, nil + } + } + } + return true, nil +} + +func (f *Controller) IsServiceReady(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) { + _, err := f.flinkClient.GetClusterOverview(ctx, getURLFromApp(application, hash)) + if err != nil { + logger.Infof(ctx, "Error response indicating flink API is not ready to handle request %v", err) + return false, err + } + return true, nil +} + +func listToFlinkDeployment(ds []v1.Deployment, hash string) *common.FlinkDeployment { + if len(ds) != 2 { + return nil + } + + fd := common.FlinkDeployment{ + Hash: hash, + } + + l0 := ds[0].Labels[FlinkDeploymentType] + l1 := ds[1].Labels[FlinkDeploymentType] + + if l0 == FlinkDeploymentTypeJobmanager && l1 == FlinkDeploymentTypeTaskmanager { + fd.Jobmanager = &ds[0] + fd.Taskmanager = &ds[1] + } else if l0 == FlinkDeploymentTypeTaskmanager && l1 == FlinkDeploymentTypeJobmanager { + fd.Jobmanager = &ds[1] + fd.Taskmanager = &ds[0] + } else { + return nil + } + + return &fd +} + +// Gets the current deployment and any other deployments for the application. The current deployment will be the one +// that matches the FlinkApplication, unless the FailedDeployHash is set, in which case it will be the one with that +// hash. 
+func (f *Controller) GetCurrentAndOldDeploymentsForApp(ctx context.Context, + application *v1alpha1.FlinkApplication) (*common.FlinkDeployment, []common.FlinkDeployment, error) { + appLabels := k8.GetAppLabel(application.Name) + deployments, err := f.k8Cluster.GetDeploymentsWithLabel(ctx, application.Namespace, appLabels) + if err != nil { + return nil, nil, err + } + + byHash := map[string][]v1.Deployment{} + for _, deployment := range deployments.Items { + byHash[deployment.Labels[FlinkAppHash]] = append(byHash[deployment.Labels[FlinkAppHash]], deployment) + } + + appHash := HashForApplication(application) + var curHash string + + if appHash == application.Status.FailedDeployHash { + curHash = application.Status.DeployHash + } else { + curHash = appHash + } + + cur := listToFlinkDeployment(byHash[curHash], curHash) + if cur != nil && application.Status.FailedDeployHash == "" && + (!f.deploymentMatches(ctx, cur.Jobmanager, application) || !f.deploymentMatches(ctx, cur.Taskmanager, application)) { + // we had a hash collision (i.e., the previous application has the same hash as the new one) + // this is *very* unlikely to occur (1/2^32) + return nil, nil, errors.New("found hash collision for deployment, you must do a clean deploy") + } + + old := make([]common.FlinkDeployment, 0) + for hash, ds := range byHash { + if hash != curHash { + fd := listToFlinkDeployment(ds, hash) + if fd != nil { + old = append(old, *fd) + } else { + logger.Warn(ctx, "Found deployments that do not have one JM and TM: %v", ds) + } + } + } + + return cur, old, nil +} + +func (f *Controller) FindExternalizedCheckpoint(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) { + checkpoint, err := f.flinkClient.GetLatestCheckpoint(ctx, getURLFromApp(application, hash), application.Status.JobStatus.JobID) + if err != nil { + return "", err + } + if checkpoint == nil { + return "", nil + } + + if time.Since(time.Unix(checkpoint.TriggerTimestamp, 0)) > maxRestoreCheckpointAge { + logger.Info(ctx, "Found checkpoint to restore from, but was too old") + return "", nil + } + + return checkpoint.ExternalPath, nil +} + +func (f *Controller) LogEvent(ctx context.Context, app *v1alpha1.FlinkApplication, fieldPath string, eventType string, message string) { + reason := "Create" + if app.Status.DeployHash != "" { + // this is not the first deploy + reason = "Update" + } + if app.DeletionTimestamp != nil { + reason = "Delete" + } + + event := k8.CreateEvent(app, fieldPath, eventType, reason, message) + logger.Infof(ctx, "Logged %s event: %s: %s", eventType, reason, message) + + // TODO: switch to using EventRecorder once we switch to controller runtime + if err := f.k8Cluster.CreateK8Object(ctx, &event); err != nil { + b, _ := json.Marshal(event) + logger.Errorf(ctx, "Failed to log event %v: %v", string(b), err) + } +} + +// Gets and updates the cluster status +func (f *Controller) CompareAndUpdateClusterStatus(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) { + oldClusterStatus := application.Status.ClusterStatus + clusterErrors := "" + // Get Cluster overview + response, err := f.flinkClient.GetClusterOverview(ctx, getURLFromApp(application, hash)) + + if err != nil { + clusterErrors = err.Error() + } else { + // Update cluster overview + application.Status.ClusterStatus.NumberOfTaskManagers = response.TaskManagerCount + application.Status.ClusterStatus.AvailableTaskSlots = response.SlotsAvailable + application.Status.ClusterStatus.NumberOfTaskSlots = 
response.NumberOfTaskSlots + } + + // Get Healthy Taskmanagers + tmResponse, tmErr := f.flinkClient.GetTaskManagers(ctx, getURLFromApp(application, hash)) + if tmErr != nil { + clusterErrors += tmErr.Error() + } else { + application.Status.ClusterStatus.HealthyTaskManagers = getHealthyTaskManagerCount(tmResponse) + } + // Determine Health of the cluster. + // Error retrieving cluster / taskmanagers overview (after startup/readiness) --> Red + // Healthy TaskManagers == Number of taskmanagers --> Green + // Else --> Yellow + if clusterErrors != "" && (application.Status.Phase != v1alpha1.FlinkApplicationClusterStarting && + application.Status.Phase != v1alpha1.FlinkApplicationSubmittingJob) { + application.Status.ClusterStatus.Health = v1alpha1.Red + return false, errors.New(clusterErrors) + } else if application.Status.ClusterStatus.HealthyTaskManagers == application.Status.ClusterStatus.NumberOfTaskManagers { + application.Status.ClusterStatus.Health = v1alpha1.Green + } else { + application.Status.ClusterStatus.Health = v1alpha1.Yellow + } + + return !apiequality.Semantic.DeepEqual(oldClusterStatus, application.Status.ClusterStatus), nil +} + +func getHealthyTaskManagerCount(response *client.TaskManagersResponse) int32 { + healthyTMCount := 0 + for index := range response.TaskManagers { + // A taskmanager is considered healthy if its last heartbeat was within taskManagerHeartbeatThreshold + if time.Since(time.Unix(response.TaskManagers[index].TimeSinceLastHeartbeat/1000, 0)) <= taskManagerHeartbeatThreshold { + healthyTMCount++ + } + } + + return int32(healthyTMCount) + +} + +func (f *Controller) CompareAndUpdateJobStatus(ctx context.Context, app *v1alpha1.FlinkApplication, hash string) (bool, error) { + // Initialize the last failing time to beginning of time if it's never been set + if app.Status.JobStatus.LastFailingTime == nil { + initTime := metav1.NewTime(time.Time{}) + app.Status.JobStatus.LastFailingTime = &initTime + } + + oldJobStatus := app.Status.JobStatus + + app.Status.JobStatus.JobID = oldJobStatus.JobID + jobResponse, err := f.flinkClient.GetJobOverview(ctx, getURLFromApp(app, hash), app.Status.JobStatus.JobID) + if err != nil { + return false, err + } + checkpoints, err := f.flinkClient.GetCheckpointCounts(ctx, getURLFromApp(app, hash), app.Status.JobStatus.JobID) + if err != nil { + return false, err + } + + // Job status + app.Status.JobStatus.State = v1alpha1.JobState(jobResponse.State) + jobStartTime := metav1.NewTime(time.Unix(jobResponse.StartTime/1000, 0)) + app.Status.JobStatus.StartTime = &jobStartTime + + // Checkpoints status + app.Status.JobStatus.FailedCheckpointCount = checkpoints.Counts["failed"] + app.Status.JobStatus.CompletedCheckpointCount = checkpoints.Counts["completed"] + app.Status.JobStatus.JobRestartCount = checkpoints.Counts["restored"] + + latestCheckpoint := checkpoints.Latest.Completed + var lastCheckpointAgeSeconds int + if latestCheckpoint != nil { + lastCheckpointTimeMillis := metav1.NewTime(time.Unix(latestCheckpoint.LatestAckTimestamp/1000, 0)) + app.Status.JobStatus.LastCheckpointTime = &lastCheckpointTimeMillis + lastCheckpointAgeSeconds = app.Status.JobStatus.LastCheckpointTime.Second() + } + + if checkpoints.Latest.Restored != nil { + app.Status.JobStatus.RestorePath = checkpoints.Latest.Restored.ExternalPath + restoreTime := metav1.NewTime(time.Unix(checkpoints.Latest.Restored.RestoredTimeStamp/1000, 0)) + app.Status.JobStatus.RestoreTime = &restoreTime + + } + + // Health Status for job + // Job is in FAILING state --> RED + // 
Time since last successful checkpoint > maxCheckpointTime --> YELLOW + // Else --> Green + + if app.Status.JobStatus.State == v1alpha1.Failing || time.Since(app.Status.JobStatus.LastFailingTime.Time) < + failingIntervalThreshold { + app.Status.JobStatus.Health = v1alpha1.Red + } else if time.Since(time.Unix(int64(lastCheckpointAgeSeconds), 0)) < maxCheckpointTime { + app.Status.JobStatus.Health = v1alpha1.Yellow + } else { + app.Status.JobStatus.Health = v1alpha1.Green + } + // Update LastFailingTime + if app.Status.JobStatus.State == v1alpha1.Failing { + currTime := metav1.Now() + app.Status.JobStatus.LastFailingTime = &currTime + } + + return !apiequality.Semantic.DeepEqual(oldJobStatus, app.Status.JobStatus), err +} diff --git a/pkg/controller/flink/flink_test.go b/pkg/controller/flink/flink_test.go new file mode 100644 index 00000000..7d88f04f --- /dev/null +++ b/pkg/controller/flink/flink_test.go @@ -0,0 +1,835 @@ +package flink + +import ( + "context" + "testing" + + "time" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/common" + "github.com/lyft/flinkk8soperator/pkg/controller/flink/client" + clientMock "github.com/lyft/flinkk8soperator/pkg/controller/flink/client/mock" + "github.com/lyft/flinkk8soperator/pkg/controller/flink/mock" + k8mock "github.com/lyft/flinkk8soperator/pkg/controller/k8/mock" + mockScope "github.com/lyft/flytestdlib/promutils" + "github.com/lyft/flytestdlib/promutils/labeled" + "github.com/pkg/errors" + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/apps/v1" + metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "k8s.io/apimachinery/pkg/runtime" +) + +const testImage = "123.xyz.com/xx:11ae1218924428faabd9b64423fa0c332efba6b2" + +// Note: if you find yourself changing this to fix a test, that should be treated as a breaking API change +const testAppHash = "718222d3" +const testAppName = "app-name" +const testNamespace = "ns" +const testJobID = "j1" +const testFlinkVersion = "1.7" + +func getTestFlinkController() Controller { + testScope := mockScope.NewTestScope() + labeled.SetMetricKeys(common.GetValidLabelNames()...) 
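// Every collaborator below comes from the mock packages imported above; tests override the relevant *Func field on a mock to stub behaviour per call.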
+ return Controller{ + jobManager: &mock.JobManagerController{}, + taskManager: &mock.TaskManagerController{}, + k8Cluster: &k8mock.K8Cluster{}, + flinkClient: &clientMock.JobManagerClient{}, + metrics: newControllerMetrics(testScope), + } +} + +func getFlinkTestApp() v1alpha1.FlinkApplication { + app := v1alpha1.FlinkApplication{ + TypeMeta: metaV1.TypeMeta{ + Kind: v1alpha1.FlinkApplicationKind, + APIVersion: v1alpha1.SchemeGroupVersion.String(), + }, + } + app.Spec.Parallelism = 8 + app.Name = testAppName + app.Namespace = testNamespace + app.Status.JobStatus.JobID = testJobID + app.Spec.Image = testImage + app.Spec.FlinkVersion = testFlinkVersion + + return app +} + +func TestFlinkIsClusterReady(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + labelMapVal := map[string]string{ + "flink-app-hash": testAppHash, + } + flinkApp := getFlinkTestApp() + + mockK8Cluster := flinkControllerForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.GetDeploymentsWithLabelFunc = func(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) { + assert.Equal(t, testNamespace, namespace) + assert.Equal(t, labelMapVal, labelMap) + jmDeployment := FetchTaskMangerDeploymentCreateObj(&flinkApp, testAppHash) + jmDeployment.Status.AvailableReplicas = 1 + + tmDeployment := FetchJobMangerDeploymentCreateObj(&flinkApp, testAppHash) + tmDeployment.Status.AvailableReplicas = *tmDeployment.Spec.Replicas + return &v1.DeploymentList{ + Items: []v1.Deployment{ + *jmDeployment, + *tmDeployment, + }, + }, nil + } + + result, err := flinkControllerForTest.IsClusterReady( + context.Background(), &flinkApp, + ) + assert.True(t, result) + assert.Nil(t, err) +} + +func TestFlinkApplicationChangedReplicas(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + labelMapVal := map[string]string{ + "flink-app": testAppName, + } + + flinkApp := getFlinkTestApp() + taskSlots := int32(16) + flinkApp.Spec.TaskManagerConfig.TaskSlots = &taskSlots + flinkApp.Spec.Parallelism = 8 + + mockK8Cluster := flinkControllerForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.GetDeploymentsWithLabelFunc = func(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) { + assert.Equal(t, testNamespace, namespace) + assert.Equal(t, labelMapVal, labelMap) + + newApp := flinkApp.DeepCopy() + newApp.Spec.Parallelism = 10 + hash := HashForApplication(newApp) + tm := *FetchTaskMangerDeploymentCreateObj(newApp, hash) + jm := *FetchJobMangerDeploymentCreateObj(newApp, hash) + + return &v1.DeploymentList{ + Items: []v1.Deployment{tm, jm}, + }, nil + } + + cur, _, err := flinkControllerForTest.GetCurrentAndOldDeploymentsForApp( + context.Background(), &flinkApp, + ) + assert.True(t, cur == nil) + assert.Nil(t, err) +} + +func TestFlinkApplicationNotChanged(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + labelMapVal := map[string]string{ + "flink-app": testAppName, + } + flinkApp := getFlinkTestApp() + mockK8Cluster := flinkControllerForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.GetDeploymentsWithLabelFunc = func(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) { + assert.Equal(t, testNamespace, namespace) + assert.Equal(t, labelMapVal, labelMap) + return &v1.DeploymentList{ + Items: []v1.Deployment{ + *FetchTaskMangerDeploymentCreateObj(&flinkApp, testAppHash), + *FetchJobMangerDeploymentCreateObj(&flinkApp, testAppHash), + }, + }, nil + } + cur, _, err := 
flinkControllerForTest.GetCurrentAndOldDeploymentsForApp( + context.Background(), &flinkApp, + ) + assert.Nil(t, err) + assert.False(t, cur == nil) +} + +func TestFlinkApplicationChanged(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + labelMapVal := map[string]string{ + "flink-app": testAppName, + } + mockK8Cluster := flinkControllerForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.GetDeploymentsWithLabelFunc = func(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) { + assert.Equal(t, testNamespace, namespace) + assert.Equal(t, labelMapVal, labelMap) + return &v1.DeploymentList{}, nil + } + flinkApp := getFlinkTestApp() + cur, _, err := flinkControllerForTest.GetCurrentAndOldDeploymentsForApp( + context.Background(), &flinkApp, + ) + assert.True(t, cur == nil) + assert.Nil(t, err) +} + +func testJobPropTriggersChange(t *testing.T, changeFun func(application *v1alpha1.FlinkApplication)) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockK8Cluster := flinkControllerForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.GetDeploymentsWithLabelFunc = func(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) { + assert.Equal(t, testNamespace, namespace) + if val, ok := labelMap["flink-app-hash"]; ok { + assert.Equal(t, testAppHash, val) + } + if val, ok := labelMap["flink-app"]; ok { + assert.Equal(t, testAppName, val) + } + hash := HashForApplication(&flinkApp) + tm := FetchTaskMangerDeploymentCreateObj(&flinkApp, hash) + jm := FetchJobMangerDeploymentCreateObj(&flinkApp, hash) + return &v1.DeploymentList{ + Items: []v1.Deployment{ + *tm, *jm, + }, + }, nil + } + + newApp := flinkApp.DeepCopy() + changeFun(newApp) + cur, _, err := flinkControllerForTest.GetCurrentAndOldDeploymentsForApp( + context.Background(), newApp, + ) + assert.True(t, cur == nil) + assert.Nil(t, err) +} + +func TestFlinkApplicationChangedJobProps(t *testing.T) { + testJobPropTriggersChange(t, func(app *v1alpha1.FlinkApplication) { + app.Spec.Parallelism = 3 + }) + + testJobPropTriggersChange(t, func(app *v1alpha1.FlinkApplication) { + app.Spec.JarName = "another.jar" + }) + + testJobPropTriggersChange(t, func(app *v1alpha1.FlinkApplication) { + app.Spec.ProgramArgs = "--test-change" + }) + + testJobPropTriggersChange(t, func(app *v1alpha1.FlinkApplication) { + app.Spec.EntryClass = "com.another.Class" + }) +} + +func TestFlinkApplicationNeedsUpdate(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockK8Cluster := flinkControllerForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.GetDeploymentsWithLabelFunc = func(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) { + assert.Equal(t, testNamespace, namespace) + if val, ok := labelMap["flink-app-hash"]; ok { + assert.Equal(t, testAppHash, val) + } + if val, ok := labelMap["flink-app"]; ok { + assert.Equal(t, testAppName, val) + } + + app := getFlinkTestApp() + jm := FetchJobMangerDeploymentCreateObj(&app, testAppHash) + tm := FetchTaskMangerDeploymentCreateObj(&app, testAppHash) + + return &v1.DeploymentList{ + Items: []v1.Deployment{ + *jm, *tm, + }, + }, nil + } + + numberOfTaskManagers := int32(2) + taskSlots := int32(2) + flinkApp.Spec.TaskManagerConfig.TaskSlots = &taskSlots + flinkApp.Spec.Parallelism = taskSlots*numberOfTaskManagers + 1 + cur, _, err := flinkControllerForTest.GetCurrentAndOldDeploymentsForApp( + 
context.Background(), &flinkApp, + ) + assert.True(t, cur == nil) + assert.Nil(t, err) +} + +func TestFlinkIsServiceReady(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.GetClusterOverviewFunc = func(ctx context.Context, url string) (*client.ClusterOverviewResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.ClusterOverviewResponse{ + TaskManagerCount: 3, + }, nil + } + isReady, err := flinkControllerForTest.IsServiceReady(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) + assert.True(t, isReady) +} + +func TestFlinkIsServiceReadyErr(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.GetClusterOverviewFunc = func(ctx context.Context, url string) (*client.ClusterOverviewResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return nil, errors.New("Get cluster failed") + } + isReady, err := flinkControllerForTest.IsServiceReady(context.Background(), &flinkApp, "hash") + assert.EqualError(t, err, "Get cluster failed") + assert.False(t, isReady) +} + +func TestFlinkGetSavepointStatus(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + flinkApp.Spec.SavepointInfo.TriggerID = "t1" + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.CheckSavepointStatusFunc = func(ctx context.Context, url string, jobID, triggerID string) (*client.SavepointResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + assert.Equal(t, jobID, testJobID) + assert.Equal(t, triggerID, "t1") + return &client.SavepointResponse{ + SavepointStatus: client.SavepointStatusResponse{ + Status: client.SavePointInProgress, + }, + }, nil + } + status, err := flinkControllerForTest.GetSavepointStatus(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) + assert.NotNil(t, status) + + assert.Equal(t, client.SavePointInProgress, status.SavepointStatus.Status) +} + +func TestFlinkGetSavepointStatusErr(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.CheckSavepointStatusFunc = func(ctx context.Context, url string, jobID, triggerID string) (*client.SavepointResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + assert.Equal(t, jobID, testJobID) + return nil, errors.New("Savepoint error") + } + status, err := flinkControllerForTest.GetSavepointStatus(context.Background(), &flinkApp, "hash") + assert.Nil(t, status) + assert.NotNil(t, err) + + assert.EqualError(t, err, "Savepoint error") +} + +func TestGetActiveJob(t *testing.T) { + job := client.FlinkJob{ + Status: client.Running, + JobID: "j1", + } + jobs := []client.FlinkJob{ + job, + } + activeJob := GetActiveFlinkJob(jobs) + assert.NotNil(t, activeJob) + assert.Equal(t, *activeJob, job) +} + +func TestGetActiveJobFinished(t *testing.T) { + job := client.FlinkJob{ + Status: client.Finished, + JobID: "j1", + } + jobs := []client.FlinkJob{ + job, + } + activeJob := GetActiveFlinkJob(jobs) + assert.NotNil(t, activeJob) + assert.Equal(t, *activeJob, job) +} + +func TestGetActiveJobNil(t *testing.T) { + job := client.FlinkJob{ + 
Status: client.Cancelling, + JobID: "j1", + } + jobs := []client.FlinkJob{ + job, + } + activeJob := GetActiveFlinkJob(jobs) + assert.Nil(t, activeJob) +} + +func TestGetActiveJobEmpty(t *testing.T) { + jobs := []client.FlinkJob{} + activeJob := GetActiveFlinkJob(jobs) + assert.Nil(t, activeJob) +} + +func TestDeleteCluster(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + jmDeployment := FetchJobMangerDeploymentDeleteObj(&flinkApp, "hash") + tmDeployment := FetchTaskMangerDeploymentDeleteObj(&flinkApp, "hash") + service := FetchVersionedJobManagerServiceDeleteObj(&flinkApp, "hash") + + ctr := 0 + mockK8Cluster := flinkControllerForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.DeleteK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + ctr++ + switch ctr { + case 1: + assert.Equal(t, object, jmDeployment) + case 2: + assert.Equal(t, object, tmDeployment) + case 3: + assert.Equal(t, object, service) + } + return nil + } + + err := flinkControllerForTest.DeleteCluster(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) +} + +func TestCreateCluster(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + mockJobManager := flinkControllerForTest.jobManager.(*mock.JobManagerController) + mockTaskManager := flinkControllerForTest.taskManager.(*mock.TaskManagerController) + + mockJobManager.CreateIfNotExistFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + return true, nil + } + mockTaskManager.CreateIfNotExistFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + return true, nil + } + err := flinkControllerForTest.CreateCluster(context.Background(), &flinkApp) + assert.Nil(t, err) +} + +func TestCreateClusterJmErr(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + mockJobManager := flinkControllerForTest.jobManager.(*mock.JobManagerController) + mockTaskManager := flinkControllerForTest.taskManager.(*mock.TaskManagerController) + + mockJobManager.CreateIfNotExistFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + return false, errors.New("jm failed") + } + mockTaskManager.CreateIfNotExistFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + assert.False(t, true) + return false, nil + } + err := flinkControllerForTest.CreateCluster(context.Background(), &flinkApp) + assert.EqualError(t, err, "jm failed") +} + +func TestCreateClusterTmErr(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + mockJobManager := flinkControllerForTest.jobManager.(*mock.JobManagerController) + mockTaskManager := flinkControllerForTest.taskManager.(*mock.TaskManagerController) + + mockJobManager.CreateIfNotExistFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + return true, nil + } + mockTaskManager.CreateIfNotExistFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + return false, errors.New("tm failed") + } + err := flinkControllerForTest.CreateCluster(context.Background(), &flinkApp) + assert.EqualError(t, err, "tm failed") +} + +func TestStartFlinkJob(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + flinkApp.Spec.Parallelism = 4 + flinkApp.Spec.ProgramArgs = "args" + flinkApp.Spec.EntryClass = "class" + 
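// The job properties configured here and just below are passed to StartFlinkJob and asserted verbatim inside the mocked SubmitJob call.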
flinkApp.Spec.JarName = "jar-name" + flinkApp.Spec.SavepointInfo.SavepointLocation = "location//" + flinkApp.Spec.FlinkVersion = "1.7" + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.SubmitJobFunc = func(ctx context.Context, url string, jarID string, submitJobRequest client.SubmitJobRequest) (*client.SubmitJobResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + assert.Equal(t, jarID, "jar-name") + assert.Equal(t, submitJobRequest.Parallelism, int32(4)) + assert.Equal(t, submitJobRequest.ProgramArgs, "args") + assert.Equal(t, submitJobRequest.EntryClass, "class") + assert.Equal(t, submitJobRequest.SavepointPath, "location//") + + return &client.SubmitJobResponse{ + JobID: testJobID, + }, nil + } + jobID, err := flinkControllerForTest.StartFlinkJob(context.Background(), &flinkApp, "hash", + flinkApp.Spec.JarName, flinkApp.Spec.Parallelism, flinkApp.Spec.EntryClass, flinkApp.Spec.ProgramArgs) + assert.Nil(t, err) + assert.Equal(t, jobID, testJobID) +} + +func TestStartFlinkJobEmptyJobID(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.SubmitJobFunc = func(ctx context.Context, url string, jarID string, submitJobRequest client.SubmitJobRequest) (*client.SubmitJobResponse, error) { + + return &client.SubmitJobResponse{}, nil + } + jobID, err := flinkControllerForTest.StartFlinkJob(context.Background(), &flinkApp, "hash", + flinkApp.Spec.JarName, flinkApp.Spec.Parallelism, flinkApp.Spec.EntryClass, flinkApp.Spec.ProgramArgs) + assert.EqualError(t, err, "unable to submit job: invalid job id") + assert.Empty(t, jobID) +} + +func TestStartFlinkJobErr(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.SubmitJobFunc = func(ctx context.Context, url string, jarID string, submitJobRequest client.SubmitJobRequest) (*client.SubmitJobResponse, error) { + return nil, errors.New("submit error") + } + jobID, err := flinkControllerForTest.StartFlinkJob(context.Background(), &flinkApp, "hash", + flinkApp.Spec.JarName, flinkApp.Spec.Parallelism, flinkApp.Spec.EntryClass, flinkApp.Spec.ProgramArgs) + assert.EqualError(t, err, "submit error") + assert.Empty(t, jobID) +} + +func TestCancelWithSavepoint(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.CancelJobWithSavepointFunc = func(ctx context.Context, url string, jobID string) (string, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + assert.Equal(t, jobID, testJobID) + return "t1", nil + } + triggerID, err := flinkControllerForTest.CancelWithSavepoint(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) + assert.Equal(t, triggerID, "t1") +} + +func TestCancelWithSavepointErr(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.CancelJobWithSavepointFunc = func(ctx context.Context, url string, jobID string) (string, error) { + return "", errors.New("cancel error") + } + triggerID, err := flinkControllerForTest.CancelWithSavepoint(context.Background(), &flinkApp, "hash") 
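// The error from the Flink client is expected to surface unchanged, with no savepoint trigger ID returned.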
+ assert.EqualError(t, err, "cancel error") + assert.Empty(t, triggerID) +} + +func TestGetJobsForApplication(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.GetJobsFunc = func(ctx context.Context, url string) (*client.GetJobsResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.GetJobsResponse{ + Jobs: []client.FlinkJob{ + { + JobID: testJobID, + }, + }, + }, nil + } + jobs, err := flinkControllerForTest.GetJobsForApplication(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) + assert.Equal(t, 1, len(jobs)) + assert.Equal(t, jobs[0].JobID, testJobID) +} + +func TestGetJobsForApplicationErr(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.GetJobsFunc = func(ctx context.Context, url string) (*client.GetJobsResponse, error) { + return nil, errors.New("get jobs error") + } + jobs, err := flinkControllerForTest.GetJobsForApplication(context.Background(), &flinkApp, "hash") + assert.EqualError(t, err, "get jobs error") + assert.Nil(t, jobs) +} + +func TestFindExternalizedCheckpoint(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + flinkApp.Status.JobStatus.JobID = "jobid" + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.GetLatestCheckpointFunc = func(ctx context.Context, url string, jobId string) (*client.CheckpointStatistics, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + assert.Equal(t, "jobid", jobId) + return &client.CheckpointStatistics{ + TriggerTimestamp: time.Now().Unix(), + ExternalPath: "/tmp/checkpoint", + }, nil + } + + checkpoint, err := flinkControllerForTest.FindExternalizedCheckpoint(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) + assert.Equal(t, "/tmp/checkpoint", checkpoint) +} + +func TestClusterStatusUpdated(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.GetClusterOverviewFunc = func(ctx context.Context, url string) (*client.ClusterOverviewResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.ClusterOverviewResponse{ + NumberOfTaskSlots: 1, + SlotsAvailable: 0, + TaskManagerCount: 1, + }, nil + } + + mockJmClient.GetTaskManagersFunc = func(ctx context.Context, url string) (*client.TaskManagersResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.TaskManagersResponse{ + TaskManagers: []client.TaskManagerStats{ + { + TimeSinceLastHeartbeat: time.Now().UnixNano() / int64(time.Millisecond), + SlotsNumber: 3, + FreeSlots: 0, + }, + }, + }, nil + } + + _, err := flinkControllerForTest.CompareAndUpdateClusterStatus(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) + assert.Equal(t, int32(1), flinkApp.Status.ClusterStatus.NumberOfTaskSlots) + assert.Equal(t, int32(0), flinkApp.Status.ClusterStatus.AvailableTaskSlots) + assert.Equal(t, int32(1), flinkApp.Status.ClusterStatus.HealthyTaskManagers) + assert.Equal(t, v1alpha1.Green, flinkApp.Status.ClusterStatus.Health) + +} + +func TestNoClusterStatusChange(t *testing.T) { + flinkControllerForTest := 
getTestFlinkController() + flinkApp := getFlinkTestApp() + flinkApp.Status.ClusterStatus.NumberOfTaskSlots = int32(1) + flinkApp.Status.ClusterStatus.AvailableTaskSlots = int32(0) + flinkApp.Status.ClusterStatus.HealthyTaskManagers = int32(1) + flinkApp.Status.ClusterStatus.Health = v1alpha1.Green + flinkApp.Status.ClusterStatus.NumberOfTaskManagers = int32(1) + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.GetClusterOverviewFunc = func(ctx context.Context, url string) (*client.ClusterOverviewResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.ClusterOverviewResponse{ + NumberOfTaskSlots: 1, + SlotsAvailable: 0, + TaskManagerCount: 1, + }, nil + } + + mockJmClient.GetTaskManagersFunc = func(ctx context.Context, url string) (*client.TaskManagersResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.TaskManagersResponse{ + TaskManagers: []client.TaskManagerStats{ + { + TimeSinceLastHeartbeat: time.Now().UnixNano() / int64(time.Millisecond), + SlotsNumber: 3, + FreeSlots: 0, + }, + }, + }, nil + } + + hasClusterStatusChanged, err := flinkControllerForTest.CompareAndUpdateClusterStatus(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) + assert.False(t, hasClusterStatusChanged) +} + +func TestHealthyTaskmanagers(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + + mockJmClient.GetClusterOverviewFunc = func(ctx context.Context, url string) (*client.ClusterOverviewResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.ClusterOverviewResponse{ + NumberOfTaskSlots: 1, + SlotsAvailable: 0, + TaskManagerCount: 1, + }, nil + } + + mockJmClient.GetTaskManagersFunc = func(ctx context.Context, url string) (*client.TaskManagersResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.TaskManagersResponse{ + TaskManagers: []client.TaskManagerStats{ + { + // 1 day old + TimeSinceLastHeartbeat: time.Now().AddDate(0, 0, -1).UnixNano() / int64(time.Millisecond), + SlotsNumber: 3, + FreeSlots: 0, + }, + }, + }, nil + } + + _, err := flinkControllerForTest.CompareAndUpdateClusterStatus(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) + assert.Equal(t, int32(1), flinkApp.Status.ClusterStatus.NumberOfTaskSlots) + assert.Equal(t, int32(0), flinkApp.Status.ClusterStatus.AvailableTaskSlots) + assert.Equal(t, int32(0), flinkApp.Status.ClusterStatus.HealthyTaskManagers) + assert.Equal(t, v1alpha1.Yellow, flinkApp.Status.ClusterStatus.Health) + +} + +func TestJobStatusUpdated(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + flinkApp := getFlinkTestApp() + startTime := metaV1.Now().UnixNano() / int64(time.Millisecond) + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + mockJmClient.GetJobOverviewFunc = func(ctx context.Context, url string, jobID string) (*client.FlinkJobOverview, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.FlinkJobOverview{ + JobID: "abc", + State: client.Running, + StartTime: startTime, + }, nil + } + + mockJmClient.GetCheckpointCountsFunc = func(ctx context.Context, url string, jobID string) (*client.CheckpointResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.CheckpointResponse{ + Counts: map[string]int32{ + "restored": 
1, + "completed": 4, + "failed": 0, + }, + Latest: client.LatestCheckpoints{ + Restored: &client.CheckpointStatistics{ + RestoredTimeStamp: startTime, + ExternalPath: "/test/externalpath", + }, + + Completed: &client.CheckpointStatistics{ + LatestAckTimestamp: startTime, + }, + }, + }, nil + } + + flinkApp.Status.JobStatus.JobID = "abc" + expectedTime := metaV1.NewTime(time.Unix(startTime/1000, 0)) + _, err := flinkControllerForTest.CompareAndUpdateJobStatus(context.Background(), &flinkApp, "hash") + assert.Nil(t, err) + + assert.Equal(t, v1alpha1.Running, flinkApp.Status.JobStatus.State) + assert.Equal(t, &expectedTime, flinkApp.Status.JobStatus.StartTime) + assert.Equal(t, v1alpha1.Green, flinkApp.Status.JobStatus.Health) + + assert.Equal(t, int32(0), flinkApp.Status.JobStatus.FailedCheckpointCount) + assert.Equal(t, int32(4), flinkApp.Status.JobStatus.CompletedCheckpointCount) + assert.Equal(t, int32(1), flinkApp.Status.JobStatus.JobRestartCount) + assert.Equal(t, &expectedTime, flinkApp.Status.JobStatus.RestoreTime) + assert.Equal(t, "/test/externalpath", flinkApp.Status.JobStatus.RestorePath) + assert.Equal(t, &expectedTime, flinkApp.Status.JobStatus.LastCheckpointTime) + +} + +func TestNoJobStatusChange(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + constTime := time.Now().UnixNano() / int64(time.Millisecond) + metaTime := metaV1.NewTime(time.Unix(constTime/1000, 0)) + app1 := getFlinkTestApp() + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + + app1.Status.JobStatus.State = v1alpha1.Running + app1.Status.JobStatus.StartTime = &metaTime + app1.Status.JobStatus.LastCheckpointTime = &metaTime + app1.Status.JobStatus.CompletedCheckpointCount = int32(4) + app1.Status.JobStatus.JobRestartCount = int32(1) + app1.Status.JobStatus.FailedCheckpointCount = int32(0) + app1.Status.JobStatus.Health = v1alpha1.Green + app1.Status.JobStatus.RestoreTime = &metaTime + app1.Status.JobStatus.RestorePath = "/test/externalpath" + + mockJmClient.GetJobOverviewFunc = func(ctx context.Context, url string, jobID string) (*client.FlinkJobOverview, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.FlinkJobOverview{ + JobID: "j1", + State: client.Running, + StartTime: constTime, + }, nil + } + + mockJmClient.GetCheckpointCountsFunc = func(ctx context.Context, url string, jobID string) (*client.CheckpointResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.CheckpointResponse{ + Counts: map[string]int32{ + "restored": 1, + "completed": 4, + "failed": 0, + }, + Latest: client.LatestCheckpoints{ + Restored: &client.CheckpointStatistics{ + RestoredTimeStamp: constTime, + ExternalPath: "/test/externalpath", + }, + + Completed: &client.CheckpointStatistics{ + LatestAckTimestamp: constTime, + }, + }, + }, nil + } + hasJobStatusChanged, err := flinkControllerForTest.CompareAndUpdateJobStatus(context.Background(), &app1, "hash") + assert.Nil(t, err) + assert.False(t, hasJobStatusChanged) + +} + +func TestGetAndUpdateJobStatusHealth(t *testing.T) { + flinkControllerForTest := getTestFlinkController() + lastFailedTime := metaV1.NewTime(time.Now().Add(-10 * time.Second)) + app1 := getFlinkTestApp() + mockJmClient := flinkControllerForTest.flinkClient.(*clientMock.JobManagerClient) + + app1.Status.JobStatus.State = v1alpha1.Failing + app1.Status.JobStatus.LastFailingTime = &lastFailedTime + + mockJmClient.GetJobOverviewFunc = func(ctx context.Context, url string, jobID string) 
(*client.FlinkJobOverview, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.FlinkJobOverview{ + JobID: "abc", + State: client.Running, + StartTime: metaV1.Now().UnixNano() / int64(time.Millisecond), + }, nil + } + + mockJmClient.GetCheckpointCountsFunc = func(ctx context.Context, url string, jobID string) (*client.CheckpointResponse, error) { + assert.Equal(t, url, "http://app-name-hash.ns:8081") + return &client.CheckpointResponse{ + Counts: map[string]int32{ + "restored": 1, + "completed": 4, + "failed": 0, + }, + }, nil + } + _, err := flinkControllerForTest.CompareAndUpdateJobStatus(context.Background(), &app1, "hash") + assert.Nil(t, err) + // Job is in a RUNNING state but was in a FAILING state in the last 1 minute, so we expect + // JobStatus.Health to be Red + assert.Equal(t, app1.Status.JobStatus.Health, v1alpha1.Red) + +} diff --git a/pkg/controller/flink/ingress.go b/pkg/controller/flink/ingress.go new file mode 100644 index 00000000..259b4432 --- /dev/null +++ b/pkg/controller/flink/ingress.go @@ -0,0 +1,67 @@ +package flink + +import ( + "regexp" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/common" + "github.com/lyft/flinkk8soperator/pkg/controller/config" + "github.com/lyft/flinkk8soperator/pkg/controller/k8" + "k8s.io/api/extensions/v1beta1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" +) + +var inputRegex = regexp.MustCompile(`{{[$]jobCluster}}`) + +func ReplaceJobURL(value string, input string) string { + return inputRegex.ReplaceAllString(value, input) +} + +func GetFlinkUIIngressURL(jobName string) string { + return ReplaceJobURL(config.GetConfig().FlinkIngressURLFormat, jobName) +} + +func FetchJobManagerIngressCreateObj(app *v1alpha1.FlinkApplication) *v1beta1.Ingress { + podLabels := common.DuplicateMap(app.Labels) + podLabels = common.CopyMap(podLabels, k8.GetAppLabel(app.Name)) + + ingressMeta := v1.ObjectMeta{ + Name: app.Name, + Labels: podLabels, + Namespace: app.Namespace, + OwnerReferences: []v1.OwnerReference{ + *v1.NewControllerRef(app, app.GroupVersionKind()), + }, + } + + backend := v1beta1.IngressBackend{ + ServiceName: app.Name, + ServicePort: intstr.IntOrString{ + Type: intstr.Int, + IntVal: getUIPort(app), + }, + } + + ingressSpec := v1beta1.IngressSpec{ + Rules: []v1beta1.IngressRule{{ + Host: GetFlinkUIIngressURL(app.Name), + IngressRuleValue: v1beta1.IngressRuleValue{ + HTTP: &v1beta1.HTTPIngressRuleValue{ + Paths: []v1beta1.HTTPIngressPath{{ + Backend: backend, + }}, + }, + }, + }}, + } + return &v1beta1.Ingress{ + ObjectMeta: ingressMeta, + TypeMeta: v1.TypeMeta{ + APIVersion: v1beta1.SchemeGroupVersion.String(), + Kind: k8.Ingress, + }, + Spec: ingressSpec, + } + +} diff --git a/pkg/controller/flink/ingress_test.go b/pkg/controller/flink/ingress_test.go new file mode 100644 index 00000000..87f9ea95 --- /dev/null +++ b/pkg/controller/flink/ingress_test.go @@ -0,0 +1,27 @@ +package flink + +import ( + "testing" + + config2 "github.com/lyft/flinkk8soperator/pkg/controller/config" + "github.com/stretchr/testify/assert" +) + +func TestReplaceJobUrl(t *testing.T) { + assert.Equal(t, + "ABC.lyft.xyz", + ReplaceJobURL("{{$jobCluster}}.lyft.xyz", "ABC")) +} + +func initTestConfig() error { + return config2.ConfigSection.SetConfig(&config2.Config{ + FlinkIngressURLFormat: "{{$jobCluster}}.lyft.xyz", + }) +} +func TestGetFlinkUIIngressURL(t *testing.T) { + err := initTestConfig() + assert.Nil(t, err) + assert.Equal(t, + 
"ABC.lyft.xyz", + GetFlinkUIIngressURL("ABC")) +} diff --git a/pkg/controller/flink/job_manager_controller.go b/pkg/controller/flink/job_manager_controller.go new file mode 100644 index 00000000..71a123eb --- /dev/null +++ b/pkg/controller/flink/job_manager_controller.go @@ -0,0 +1,389 @@ +package flink + +import ( + "context" + "fmt" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/common" + "github.com/lyft/flinkk8soperator/pkg/controller/config" + "github.com/lyft/flinkk8soperator/pkg/controller/k8" + "github.com/lyft/flytestdlib/logger" + "github.com/lyft/flytestdlib/promutils" + "github.com/lyft/flytestdlib/promutils/labeled" + v1 "k8s.io/api/apps/v1" + coreV1 "k8s.io/api/core/v1" + k8_err "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" +) + +const ( + JobManagerNameFormat = "%s-%s-jm" + JobManagerPodNameFormat = "%s-%s-jm-pod" + JobManagerContainerName = "jobmanager" + JobManagerArg = "jobmanager" + JobManagerReadinessPath = "/config" + JobManagerReadinessInitialDelaySec = 10 + JobManagerReadinessTimeoutSec = 1 + JobManagerReadinessSuccessThreshold = 1 + JobManagerReadinessFailureThreshold = 2 + JobManagerReadinessPeriodSec = 5 +) + +const ( + FlinkRPCPortName = "rpc" + FlinkQueryPortName = "query" + FlinkBlobPortName = "blob" + FlinkUIPortName = "ui" + FlinkInternalMetricPortName = "metrics" +) + +func VersionedJobManagerService(app *v1alpha1.FlinkApplication, hash string) string { + return fmt.Sprintf("%s-%s", app.Name, hash) +} + +type JobManagerControllerInterface interface { + CreateIfNotExist(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) +} + +func NewJobManagerController(k8sCluster k8.ClusterInterface, config config.RuntimeConfig) JobManagerControllerInterface { + metrics := newJobManagerMetrics(config.MetricsScope) + return &JobManagerController{ + k8Cluster: k8sCluster, + metrics: metrics, + } +} + +type JobManagerController struct { + k8Cluster k8.ClusterInterface + metrics *jobManagerMetrics +} + +func newJobManagerMetrics(scope promutils.Scope) *jobManagerMetrics { + jobManagerControllerScope := scope.NewSubScope("job_manager_controller") + return &jobManagerMetrics{ + scope: scope, + deploymentCreationSuccess: labeled.NewCounter("deployment_create_success", "Job manager deployment created successfully", jobManagerControllerScope), + deploymentCreationFailure: labeled.NewCounter("deployment_create_failure", "Job manager deployment creation failed", jobManagerControllerScope), + serviceCreationSuccess: labeled.NewCounter("service_create_success", "Job manager service created successfully", jobManagerControllerScope), + serviceCreationFailure: labeled.NewCounter("service_create_failure", "Job manager service creation failed", jobManagerControllerScope), + ingressCreationSuccess: labeled.NewCounter("ingress_create_success", "Job manager ingress created successfully", jobManagerControllerScope), + ingressCreationFailure: labeled.NewCounter("ingress_create_failure", "Job manager ingress creation failed", jobManagerControllerScope), + } +} + +type jobManagerMetrics struct { + scope promutils.Scope + deploymentCreationSuccess labeled.Counter + deploymentCreationFailure labeled.Counter + serviceCreationSuccess labeled.Counter + serviceCreationFailure labeled.Counter + ingressCreationSuccess labeled.Counter + ingressCreationFailure labeled.Counter +} + +func (j 
*JobManagerController) CreateIfNotExist(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + hash := HashForApplication(application) + newlyCreated := false + + jobManagerDeployment := FetchJobMangerDeploymentCreateObj(application, hash) + err := j.k8Cluster.CreateK8Object(ctx, jobManagerDeployment) + if err != nil { + if !k8_err.IsAlreadyExists(err) { + j.metrics.deploymentCreationFailure.Inc(ctx) + logger.Errorf(ctx, "Jobmanager deployment creation failed %v", err) + return false, err + } + logger.Infof(ctx, "Jobmanager deployment already exists") + } else { + newlyCreated = true + j.metrics.deploymentCreationSuccess.Inc(ctx) + } + + // create the generic job manager service, used by the ingress to provide UI access + // there will only be one of these across the lifetime of the application + genericService := FetchJobManagerServiceCreateObj(application, hash) + err = j.k8Cluster.CreateK8Object(ctx, genericService) + if err != nil { + if !k8_err.IsAlreadyExists(err) { + j.metrics.serviceCreationFailure.Inc(ctx) + logger.Errorf(ctx, "Jobmanager service creation failed %v", err) + return false, err + } + logger.Infof(ctx, "Jobmanager service already exists") + } else { + newlyCreated = true + j.metrics.serviceCreationSuccess.Inc(ctx) + } + + // create the service for _this_ version of the flink application + // this gives us a stable and reliable way to target a particular cluster during upgrades + versionedJobManagerService := FetchJobManagerServiceCreateObj(application, hash) + versionedJobManagerService.Name = VersionedJobManagerService(application, hash) + + err = j.k8Cluster.CreateK8Object(ctx, versionedJobManagerService) + if err != nil { + if !k8_err.IsAlreadyExists(err) { + j.metrics.serviceCreationFailure.Inc(ctx) + logger.Errorf(ctx, "Versioned Jobmanager service creation failed %v", err) + return false, err + } + logger.Infof(ctx, "Vesioned Jobmanager service already exists") + } else { + newlyCreated = true + j.metrics.serviceCreationSuccess.Inc(ctx) + } + + jobManagerIngress := FetchJobManagerIngressCreateObj(application) + err = j.k8Cluster.CreateK8Object(ctx, jobManagerIngress) + if err != nil { + if !k8_err.IsAlreadyExists(err) { + j.metrics.ingressCreationFailure.Inc(ctx) + logger.Errorf(ctx, "Jobmanager ingress creation failed %v", err) + return false, err + } + logger.Infof(ctx, "Jobmanager ingress already exists") + } else { + newlyCreated = true + j.metrics.ingressCreationSuccess.Inc(ctx) + } + + return newlyCreated, nil +} + +var JobManagerDefaultResources = coreV1.ResourceRequirements{ + Requests: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("4"), + coreV1.ResourceMemory: resource.MustParse("3072Mi"), + }, + Limits: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("4"), + coreV1.ResourceMemory: resource.MustParse("3072Mi"), + }, +} + +func getJobManagerPodName(application *v1alpha1.FlinkApplication, hash string) string { + applicationName := application.Name + return fmt.Sprintf(JobManagerPodNameFormat, applicationName, hash) +} + +func getJobManagerName(application *v1alpha1.FlinkApplication, hash string) string { + applicationName := application.Name + return fmt.Sprintf(JobManagerNameFormat, applicationName, hash) +} + +func FetchVersionedJobManagerServiceDeleteObj(app *v1alpha1.FlinkApplication, hash string) *coreV1.Service { + return &coreV1.Service{ + TypeMeta: metaV1.TypeMeta{ + APIVersion: coreV1.SchemeGroupVersion.String(), + Kind: k8.Service, + }, + ObjectMeta: metaV1.ObjectMeta{ + Name: 
VersionedJobManagerService(app, hash), + Namespace: app.Namespace, + }, + } +} + +func FetchJobManagerServiceCreateObj(app *v1alpha1.FlinkApplication, hash string) *coreV1.Service { + jmServiceName := app.Name + serviceLabels := getCommonAppLabels(app) + serviceLabels[FlinkAppHash] = hash + serviceLabels[FlinkDeploymentType] = FlinkDeploymentTypeJobmanager + + return &coreV1.Service{ + TypeMeta: metaV1.TypeMeta{ + APIVersion: coreV1.SchemeGroupVersion.String(), + Kind: k8.Service, + }, + ObjectMeta: metaV1.ObjectMeta{ + Name: jmServiceName, + Namespace: app.Namespace, + OwnerReferences: []metaV1.OwnerReference{ + *metaV1.NewControllerRef(app, app.GroupVersionKind()), + }, + }, + Spec: coreV1.ServiceSpec{ + Ports: getJobManagerServicePorts(app), + Selector: serviceLabels, + }, + } +} + +func getJobManagerServicePorts(app *v1alpha1.FlinkApplication) []coreV1.ServicePort { + ports := getJobManagerPorts(app) + servicePorts := make([]coreV1.ServicePort, 0, len(ports)) + for _, p := range ports { + servicePorts = append(servicePorts, coreV1.ServicePort{ + Name: p.Name, + Port: p.ContainerPort, + }) + } + return servicePorts +} + +func getJobManagerPorts(app *v1alpha1.FlinkApplication) []coreV1.ContainerPort { + return []coreV1.ContainerPort{ + { + Name: FlinkRPCPortName, + ContainerPort: getRPCPort(app), + }, + { + Name: FlinkBlobPortName, + ContainerPort: getBlobPort(app), + }, + { + Name: FlinkQueryPortName, + ContainerPort: getQueryPort(app), + }, + { + Name: FlinkUIPortName, + ContainerPort: getUIPort(app), + }, + { + Name: FlinkInternalMetricPortName, + ContainerPort: getInternalMetricsQueryPort(app), + }, + } +} + +func FetchJobManagerContainerObj(application *v1alpha1.FlinkApplication) *coreV1.Container { + jmConfig := application.Spec.JobManagerConfig + resources := jmConfig.Resources + if resources == nil { + resources = &JobManagerDefaultResources + } + + ports := getJobManagerPorts(application) + operatorEnv := GetFlinkContainerEnv(application) + operatorEnv = append(operatorEnv, jmConfig.Environment.Env...) + + return &coreV1.Container{ + Name: getFlinkContainerName(JobManagerContainerName), + Image: application.Spec.Image, + ImagePullPolicy: ImagePullPolicy(application), + Resources: *resources, + Args: []string{JobManagerArg}, + Ports: ports, + Env: operatorEnv, + EnvFrom: jmConfig.Environment.EnvFrom, + VolumeMounts: application.Spec.VolumeMounts, + ReadinessProbe: &coreV1.Probe{ + Handler: coreV1.Handler{ + HTTPGet: &coreV1.HTTPGetAction{ + Path: JobManagerReadinessPath, + Port: intstr.FromInt(int(getUIPort(application))), + }, + }, + InitialDelaySeconds: JobManagerReadinessInitialDelaySec, + TimeoutSeconds: JobManagerReadinessTimeoutSec, + SuccessThreshold: JobManagerReadinessSuccessThreshold, + FailureThreshold: JobManagerReadinessFailureThreshold, + PeriodSeconds: JobManagerReadinessPeriodSec, + }, + } +} + +func DeploymentIsJobmanager(deployment *v1.Deployment) bool { + return deployment.Labels[FlinkDeploymentType] == FlinkDeploymentTypeJobmanager +} + +func FetchJobMangerDeploymentDeleteObj(app *v1alpha1.FlinkApplication, hash string) *v1.Deployment { + return &v1.Deployment{ + TypeMeta: metaV1.TypeMeta{ + APIVersion: v1.SchemeGroupVersion.String(), + Kind: k8.Deployment, + }, + ObjectMeta: metaV1.ObjectMeta{ + Namespace: app.Namespace, + Name: getJobManagerName(app, hash), + }, + } +} + +// Translates a FlinkApplication into a JobManager deployment. Changes to this function must be +// made very carefully. 
Any new version v' that causes DeploymentsEqual(v(x), v'(x)) to be false +// will cause redeployments for all applications, and should be considered a breaking change that +// requires a new version of the CRD. +func jobmanagerTemplate(app *v1alpha1.FlinkApplication) *v1.Deployment { + labels := getCommonAppLabels(app) + labels = common.CopyMap(labels, app.Labels) + labels[FlinkDeploymentType] = FlinkDeploymentTypeJobmanager + + podSelector := &metaV1.LabelSelector{ + MatchLabels: labels, + } + + replicas := getJobmanagerReplicas(app) + jobManagerContainer := FetchJobManagerContainerObj(app) + + return &v1.Deployment{ + TypeMeta: metaV1.TypeMeta{ + APIVersion: v1.SchemeGroupVersion.String(), + Kind: k8.Deployment, + }, + ObjectMeta: metaV1.ObjectMeta{ + Namespace: app.Namespace, + Labels: labels, + Annotations: getCommonAnnotations(app), + OwnerReferences: []metaV1.OwnerReference{ + *metaV1.NewControllerRef(app, app.GroupVersionKind()), + }, + }, + Spec: v1.DeploymentSpec{ + Selector: podSelector, + Strategy: v1.DeploymentStrategy{ + Type: v1.RecreateDeploymentStrategyType, + }, + Replicas: &replicas, + Template: coreV1.PodTemplateSpec{ + ObjectMeta: metaV1.ObjectMeta{ + Namespace: app.Namespace, + Labels: labels, + Annotations: app.Annotations, + }, + Spec: coreV1.PodSpec{ + Containers: []coreV1.Container{ + *jobManagerContainer, + }, + Volumes: app.Spec.Volumes, + ImagePullSecrets: app.Spec.ImagePullSecrets, + }, + }, + }, + } +} + +func FetchJobMangerDeploymentCreateObj(app *v1alpha1.FlinkApplication, hash string) *v1.Deployment { + template := jobmanagerTemplate(app.DeepCopy()) + + template.Name = getJobManagerName(app, hash) + template.Labels[FlinkAppHash] = hash + template.Spec.Template.Labels[FlinkAppHash] = hash + template.Spec.Selector.MatchLabels[FlinkAppHash] = hash + template.Spec.Template.Name = getJobManagerPodName(app, hash) + + InjectHashesIntoConfig(template, app, hash) + + return template +} + +func JobManagerDeploymentMatches(deployment *v1.Deployment, application *v1alpha1.FlinkApplication) bool { + deploymentFromApp := FetchJobMangerDeploymentCreateObj(application, HashForApplication(application)) + return DeploymentsEqual(deploymentFromApp, deployment) +} + +func getJobManagerCount(deployments []v1.Deployment, application *v1alpha1.FlinkApplication) int32 { + jobManagerDeployment := getJobManagerDeployment(deployments, application) + if jobManagerDeployment == nil { + return 0 + } + return *jobManagerDeployment.Spec.Replicas +} + +func getJobManagerDeployment(deployments []v1.Deployment, application *v1alpha1.FlinkApplication) *v1.Deployment { + jmDeploymentName := getJobManagerName(application, HashForApplication(application)) + return k8.GetDeploymentWithName(deployments, jmDeploymentName) +} diff --git a/pkg/controller/flink/job_manager_controller_test.go b/pkg/controller/flink/job_manager_controller_test.go new file mode 100644 index 00000000..5506b74b --- /dev/null +++ b/pkg/controller/flink/job_manager_controller_test.go @@ -0,0 +1,162 @@ +package flink + +import ( + "testing" + + k8mock "github.com/lyft/flinkk8soperator/pkg/controller/k8/mock" + mockScope "github.com/lyft/flytestdlib/promutils" + + "context" + + "github.com/lyft/flinkk8soperator/pkg/controller/common" + "github.com/lyft/flytestdlib/promutils/labeled" + "github.com/pkg/errors" + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/apps/v1" + coreV1 "k8s.io/api/core/v1" + "k8s.io/api/extensions/v1beta1" + k8sErrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + 
"k8s.io/apimachinery/pkg/runtime/schema" +) + +func getJMControllerForTest() JobManagerController { + testScope := mockScope.NewTestScope() + labeled.SetMetricKeys(common.GetValidLabelNames()...) + + return JobManagerController{ + metrics: newJobManagerMetrics(testScope), + k8Cluster: &k8mock.K8Cluster{}, + } +} + +func TestGetJobManagerName(t *testing.T) { + app := getFlinkTestApp() + assert.Equal(t, "app-name-"+testAppHash+"-jm", getJobManagerName(&app, testAppHash)) +} + +func TestGetJobManagerPodName(t *testing.T) { + app := getFlinkTestApp() + assert.Equal(t, "app-name-"+testAppHash+"-jm-pod", getJobManagerPodName(&app, testAppHash)) +} + +func TestGetJobManagerDeployment(t *testing.T) { + app := getFlinkTestApp() + deployment := v1.Deployment{} + deployment.Name = getJobManagerName(&app, testAppHash) + deployments := []v1.Deployment{ + deployment, + } + assert.Equal(t, deployment, *getJobManagerDeployment(deployments, &app)) +} + +func TestGetJobManagerReplicaCount(t *testing.T) { + app := getFlinkTestApp() + deployment := v1.Deployment{} + deployment.Name = getJobManagerName(&app, HashForApplication(&app)) + replicaCount := int32(2) + deployment.Spec.Replicas = &replicaCount + deployments := []v1.Deployment{ + deployment, + } + assert.Equal(t, int32(2), getJobManagerCount(deployments, &app)) +} + +func TestJobManagerCreateSuccess(t *testing.T) { + testController := getJMControllerForTest() + app := getFlinkTestApp() + app.Spec.JarName = "test.jar" + app.Spec.EntryClass = "com.test.MainClass" + app.Spec.ProgramArgs = "--test" + annotations := map[string]string{ + "key": "annotation", + "flink-job-properties": "jarName: test.jar\nparallelism: 8\nentryClass:com.test.MainClass\nprogramArgs:\"--test\"", + } + app.Annotations = annotations + hash := "922eff1b" + expectedLabels := map[string]string{ + "flink-app": "app-name", + "flink-app-hash": hash, + "flink-deployment-type": "jobmanager", + } + ctr := 0 + mockK8Cluster := testController.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.CreateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + ctr++ + switch ctr { + case 1: + deployment := object.(*v1.Deployment) + assert.Equal(t, getJobManagerName(&app, hash), deployment.Name) + assert.Equal(t, app.Namespace, deployment.Namespace) + assert.Equal(t, getJobManagerPodName(&app, hash), deployment.Spec.Template.Name) + assert.Equal(t, annotations, deployment.Annotations) + assert.Equal(t, annotations, deployment.Spec.Template.Annotations) + assert.Equal(t, app.Namespace, deployment.Spec.Template.Namespace) + assert.Equal(t, expectedLabels, deployment.Labels) + assert.Equal(t, int32(1), *deployment.Spec.Replicas) + assert.Equal(t, "app-name", deployment.OwnerReferences[0].Name) + assert.Equal(t, "flink.k8s.io/v1alpha1", deployment.OwnerReferences[0].APIVersion) + assert.Equal(t, "FlinkApplication", deployment.OwnerReferences[0].Kind) + + assert.Equal(t, "blob.server.port: 6125\njobmanager.heap.size: 1536\n"+ + "jobmanager.rpc.port: 6123\n"+ + "jobmanager.web.port: 8081\nmetrics.internal.query-service.port: 50101\n"+ + "query.server.port: 6124\ntaskmanager.heap.size: 512\n"+ + "taskmanager.numberOfTaskSlots: 16\n\n"+ + "high-availability.cluster-id: app-name-"+hash+"\n"+ + "jobmanager.rpc.address: app-name-"+hash+"\n", + common.GetEnvVar(deployment.Spec.Template.Spec.Containers[0].Env, + "OPERATOR_FLINK_CONFIG").Value) + case 2: + service := object.(*coreV1.Service) + assert.Equal(t, app.Name, service.Name) + assert.Equal(t, app.Namespace, service.Namespace) + assert.Equal(t, 
map[string]string{"flink-app": "app-name", "flink-app-hash": hash, "flink-deployment-type": "jobmanager"}, service.Spec.Selector) + case 3: + service := object.(*coreV1.Service) + assert.Equal(t, app.Name+"-"+hash, service.Name) + assert.Equal(t, "app-name", service.OwnerReferences[0].Name) + assert.Equal(t, app.Namespace, service.Namespace) + assert.Equal(t, map[string]string{"flink-app": "app-name", "flink-app-hash": hash, "flink-deployment-type": "jobmanager"}, service.Spec.Selector) + case 4: + labels := map[string]string{ + "flink-app": "app-name", + } + ingress := object.(*v1beta1.Ingress) + assert.Equal(t, app.Name, ingress.Name) + assert.Equal(t, app.Namespace, ingress.Namespace) + assert.Equal(t, labels, ingress.Labels) + } + return nil + } + newlyCreated, err := testController.CreateIfNotExist(context.Background(), &app) + assert.Nil(t, err) + assert.True(t, newlyCreated) +} + +func TestJobManagerCreateErr(t *testing.T) { + testController := getJMControllerForTest() + app := getFlinkTestApp() + mockK8Cluster := testController.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.CreateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + return errors.New("create error") + } + newlyCreated, err := testController.CreateIfNotExist(context.Background(), &app) + assert.EqualError(t, err, "create error") + assert.False(t, newlyCreated) +} + +func TestJobManagerCreateAlreadyExists(t *testing.T) { + testController := getJMControllerForTest() + app := getFlinkTestApp() + mockK8Cluster := testController.k8Cluster.(*k8mock.K8Cluster) + ctr := 0 + mockK8Cluster.CreateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + ctr++ + return k8sErrors.NewAlreadyExists(schema.GroupResource{}, "") + } + newlyCreated, err := testController.CreateIfNotExist(context.Background(), &app) + assert.Equal(t, ctr, 4) + assert.Nil(t, err) + assert.False(t, newlyCreated) +} diff --git a/pkg/controller/flink/mock/mock_flink.go b/pkg/controller/flink/mock/mock_flink.go new file mode 100644 index 00000000..f11d6f4d --- /dev/null +++ b/pkg/controller/flink/mock/mock_flink.go @@ -0,0 +1,141 @@ +package mock + +import ( + "context" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/common" + "github.com/lyft/flinkk8soperator/pkg/controller/flink/client" + "github.com/lyft/flinkk8soperator/pkg/controller/k8" + corev1 "k8s.io/api/core/v1" +) + +type CreateClusterFunc func(ctx context.Context, application *v1alpha1.FlinkApplication) error +type DeleteClusterFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error +type CancelWithSavepointFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) +type ForceCancelFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error +type StartFlinkJobFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string, + jarName string, parallelism int32, entryClass string, programArgs string) (string, error) +type GetSavepointStatusFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (*client.SavepointResponse, error) +type IsClusterReadyFunc func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) +type IsServiceReadyFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) +type GetJobsForApplicationFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash 
string) ([]client.FlinkJob, error) +type GetCurrentAndOldDeploymentsForAppFunc func(ctx context.Context, application *v1alpha1.FlinkApplication) (*common.FlinkDeployment, []common.FlinkDeployment, error) +type FindExternalizedCheckpointFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) +type CompareAndUpdateClusterStatusFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) +type CompareAndUpdateJobStatusFunc func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) + +type FlinkController struct { + CreateClusterFunc CreateClusterFunc + DeleteClusterFunc DeleteClusterFunc + CancelWithSavepointFunc CancelWithSavepointFunc + ForceCancelFunc ForceCancelFunc + StartFlinkJobFunc StartFlinkJobFunc + GetSavepointStatusFunc GetSavepointStatusFunc + IsClusterReadyFunc IsClusterReadyFunc + IsServiceReadyFunc IsServiceReadyFunc + GetJobsForApplicationFunc GetJobsForApplicationFunc + GetCurrentAndOldDeploymentsForAppFunc GetCurrentAndOldDeploymentsForAppFunc + FindExternalizedCheckpointFunc FindExternalizedCheckpointFunc + Events []corev1.Event + CompareAndUpdateClusterStatusFunc CompareAndUpdateClusterStatusFunc + CompareAndUpdateJobStatusFunc CompareAndUpdateJobStatusFunc +} + +func (m *FlinkController) GetCurrentAndOldDeploymentsForApp(ctx context.Context, application *v1alpha1.FlinkApplication) (*common.FlinkDeployment, []common.FlinkDeployment, error) { + if m.GetCurrentAndOldDeploymentsForAppFunc != nil { + return m.GetCurrentAndOldDeploymentsForAppFunc(ctx, application) + } + return nil, nil, nil +} + +func (m *FlinkController) DeleteCluster(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error { + if m.DeleteClusterFunc != nil { + return m.DeleteClusterFunc(ctx, application, hash) + } + return nil +} + +func (m *FlinkController) CreateCluster(ctx context.Context, application *v1alpha1.FlinkApplication) error { + if m.CreateClusterFunc != nil { + return m.CreateClusterFunc(ctx, application) + } + return nil +} + +func (m *FlinkController) CancelWithSavepoint(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) { + if m.CancelWithSavepointFunc != nil { + return m.CancelWithSavepointFunc(ctx, application, hash) + } + return "", nil +} + +func (m *FlinkController) ForceCancel(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error { + if m.ForceCancelFunc != nil { + return m.ForceCancelFunc(ctx, application, hash) + } + return nil +} + +func (m *FlinkController) StartFlinkJob(ctx context.Context, application *v1alpha1.FlinkApplication, hash string, + jarName string, parallelism int32, entryClass string, programArgs string) (string, error) { + if m.StartFlinkJobFunc != nil { + return m.StartFlinkJobFunc(ctx, application, hash, jarName, parallelism, entryClass, programArgs) + } + return "", nil +} + +func (m *FlinkController) GetSavepointStatus(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (*client.SavepointResponse, error) { + if m.GetSavepointStatusFunc != nil { + return m.GetSavepointStatusFunc(ctx, application, hash) + } + return nil, nil +} + +func (m *FlinkController) IsClusterReady(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + if m.IsClusterReadyFunc != nil { + return m.IsClusterReadyFunc(ctx, application) + } + return false, nil +} + +func (m *FlinkController) IsServiceReady(ctx context.Context, application 
*v1alpha1.FlinkApplication, hash string) (bool, error) { + if m.IsServiceReadyFunc != nil { + return m.IsServiceReadyFunc(ctx, application, hash) + } + return false, nil +} + +func (m *FlinkController) GetJobsForApplication(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) ([]client.FlinkJob, error) { + if m.GetJobsForApplicationFunc != nil { + return m.GetJobsForApplicationFunc(ctx, application, hash) + } + return nil, nil +} + +func (m *FlinkController) FindExternalizedCheckpoint(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) { + if m.FindExternalizedCheckpointFunc != nil { + return m.FindExternalizedCheckpointFunc(ctx, application, hash) + } + return "", nil +} + +func (m *FlinkController) LogEvent(ctx context.Context, app *v1alpha1.FlinkApplication, fieldPath string, eventType string, message string) { + m.Events = append(m.Events, k8.CreateEvent(app, fieldPath, eventType, "Test", message)) +} + +func (m *FlinkController) CompareAndUpdateClusterStatus(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) { + if m.CompareAndUpdateClusterStatusFunc != nil { + return m.CompareAndUpdateClusterStatusFunc(ctx, application, hash) + } + + return false, nil +} + +func (m *FlinkController) CompareAndUpdateJobStatus(ctx context.Context, app *v1alpha1.FlinkApplication, hash string) (bool, error) { + if m.CompareAndUpdateJobStatusFunc != nil { + return m.CompareAndUpdateJobStatusFunc(ctx, app, hash) + } + + return false, nil +} diff --git a/pkg/controller/flink/mock/mock_job_manager_controller.go b/pkg/controller/flink/mock/mock_job_manager_controller.go new file mode 100644 index 00000000..af74fa07 --- /dev/null +++ b/pkg/controller/flink/mock/mock_job_manager_controller.go @@ -0,0 +1,20 @@ +package mock + +import ( + "context" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" +) + +type JobManagerController struct { + CreateIfNotExistFunc CreateIfNotExistFunc +} + +func (m *JobManagerController) CreateIfNotExist( + ctx context.Context, + application *v1alpha1.FlinkApplication) (bool, error) { + if m.CreateIfNotExistFunc != nil { + return m.CreateIfNotExistFunc(ctx, application) + } + return false, nil +} diff --git a/pkg/controller/flink/mock/mock_task_manager_controller.go b/pkg/controller/flink/mock/mock_task_manager_controller.go new file mode 100644 index 00000000..06275e98 --- /dev/null +++ b/pkg/controller/flink/mock/mock_task_manager_controller.go @@ -0,0 +1,21 @@ +package mock + +import ( + "context" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" +) + +type CreateIfNotExistFunc func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) + +type TaskManagerController struct { + CreateIfNotExistFunc CreateIfNotExistFunc +} + +func (m *TaskManagerController) CreateIfNotExist( + ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + if m.CreateIfNotExistFunc != nil { + return m.CreateIfNotExistFunc(ctx, application) + } + return false, nil +} diff --git a/pkg/controller/flink/task_manager_controller.go b/pkg/controller/flink/task_manager_controller.go new file mode 100644 index 00000000..7b06d891 --- /dev/null +++ b/pkg/controller/flink/task_manager_controller.go @@ -0,0 +1,264 @@ +package flink + +import ( + "context" + "fmt" + "math" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/common" + "github.com/lyft/flinkk8soperator/pkg/controller/config" + 
"github.com/lyft/flinkk8soperator/pkg/controller/k8" + "github.com/lyft/flytestdlib/logger" + "github.com/lyft/flytestdlib/promutils" + "github.com/lyft/flytestdlib/promutils/labeled" + v1 "k8s.io/api/apps/v1" + coreV1 "k8s.io/api/core/v1" + k8_err "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + TaskManagerNameFormat = "%s-%s-tm" + TaskManagerPodNameFormat = "%s-%s-tm-pod" + TaskManagerContainerName = "taskmanager" + TaskManagerArg = "taskmanager" + TaskManagerHostnameEnvVar = "TASKMANAGER_HOSTNAME" +) + +type TaskManagerControllerInterface interface { + CreateIfNotExist(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) +} + +func NewTaskManagerController(k8sCluster k8.ClusterInterface, config config.RuntimeConfig) TaskManagerControllerInterface { + metrics := newTaskManagerMetrics(config.MetricsScope) + return &TaskManagerController{ + k8Cluster: k8sCluster, + metrics: metrics, + } +} + +type TaskManagerController struct { + k8Cluster k8.ClusterInterface + metrics *taskManagerMetrics +} + +func newTaskManagerMetrics(scope promutils.Scope) *taskManagerMetrics { + taskManagerControllerScope := scope.NewSubScope("task_manager_controller") + return &taskManagerMetrics{ + scope: scope, + deploymentCreationSuccess: labeled.NewCounter("deployment_create_success", "Task manager deployment created successfully", taskManagerControllerScope), + deploymentCreationFailure: labeled.NewCounter("deployment_create_failure", "Task manager deployment creation failed", taskManagerControllerScope), + } +} + +type taskManagerMetrics struct { + scope promutils.Scope + deploymentCreationSuccess labeled.Counter + deploymentCreationFailure labeled.Counter +} + +var TaskManagerDefaultResources = coreV1.ResourceRequirements{ + Requests: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1024Mi"), + }, + Limits: coreV1.ResourceList{ + coreV1.ResourceCPU: resource.MustParse("2"), + coreV1.ResourceMemory: resource.MustParse("1024Mi"), + }, +} + +func (t *TaskManagerController) CreateIfNotExist(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + hash := HashForApplication(application) + + taskManagerDeployment := FetchTaskMangerDeploymentCreateObj(application, hash) + err := t.k8Cluster.CreateK8Object(ctx, taskManagerDeployment) + if err != nil { + if !k8_err.IsAlreadyExists(err) { + logger.Errorf(ctx, "Taskmanager deployment creation failed %v", err) + t.metrics.deploymentCreationFailure.Inc(ctx) + return false, err + } + logger.Infof(ctx, "Taskmanager deployment already exists") + } else { + t.metrics.deploymentCreationSuccess.Inc(ctx) + return true, nil + } + + return false, nil +} + +func getTaskManagerDeployment(deployments []v1.Deployment, application *v1alpha1.FlinkApplication) *v1.Deployment { + tmDeploymentName := getTaskManagerName(application, HashForApplication(application)) + return k8.GetDeploymentWithName(deployments, tmDeploymentName) +} + +func getTaskManagerCount(deployments []v1.Deployment, application *v1alpha1.FlinkApplication) int32 { + taskManagerDeployment := getTaskManagerDeployment(deployments, application) + if taskManagerDeployment == nil { + return 0 + } + return *taskManagerDeployment.Spec.Replicas +} + +func GetTaskManagerPorts(app *v1alpha1.FlinkApplication) []coreV1.ContainerPort { + return []coreV1.ContainerPort{ + { + Name: FlinkRPCPortName, + ContainerPort: getRPCPort(app), + }, + 
{ + Name: FlinkBlobPortName, + ContainerPort: getBlobPort(app), + }, + { + Name: FlinkQueryPortName, + ContainerPort: getQueryPort(app), + }, + { + Name: FlinkInternalMetricPortName, + ContainerPort: getInternalMetricsQueryPort(app), + }, + } +} + +func FetchTaskManagerContainerObj(application *v1alpha1.FlinkApplication) *coreV1.Container { + tmConfig := application.Spec.TaskManagerConfig + ports := GetTaskManagerPorts(application) + resources := tmConfig.Resources + if resources == nil { + resources = &TaskManagerDefaultResources + } + + operatorEnv := GetFlinkContainerEnv(application) + + operatorEnv = append(operatorEnv, coreV1.EnvVar{ + Name: TaskManagerHostnameEnvVar, + ValueFrom: &coreV1.EnvVarSource{ + FieldRef: &coreV1.ObjectFieldSelector{ + FieldPath: "status.podIP", + }, + }, + }) + + operatorEnv = append(operatorEnv, tmConfig.Environment.Env...) + + return &coreV1.Container{ + Name: getFlinkContainerName(TaskManagerContainerName), + Image: application.Spec.Image, + ImagePullPolicy: ImagePullPolicy(application), + Resources: *resources, + Args: []string{TaskManagerArg}, + Ports: ports, + Env: operatorEnv, + EnvFrom: tmConfig.Environment.EnvFrom, + VolumeMounts: application.Spec.VolumeMounts, + } +} + +func getTaskManagerPodName(application *v1alpha1.FlinkApplication, hash string) string { + applicationName := application.Name + return fmt.Sprintf(TaskManagerPodNameFormat, applicationName, hash) +} + +func getTaskManagerName(application *v1alpha1.FlinkApplication, hash string) string { + applicationName := application.Name + return fmt.Sprintf(TaskManagerNameFormat, applicationName, hash) +} + +func computeTaskManagerReplicas(application *v1alpha1.FlinkApplication) int32 { + slots := getTaskmanagerSlots(application) + parallelism := application.Spec.Parallelism + return int32(math.Ceil(float64(parallelism) / float64(slots))) +} + +func DeploymentIsTaskmanager(deployment *v1.Deployment) bool { + return deployment.Labels[FlinkDeploymentType] == FlinkDeploymentTypeTaskmanager +} + +func FetchTaskMangerDeploymentDeleteObj(app *v1alpha1.FlinkApplication, hash string) *v1.Deployment { + return &v1.Deployment{ + TypeMeta: metaV1.TypeMeta{ + APIVersion: v1.SchemeGroupVersion.String(), + Kind: k8.Deployment, + }, + ObjectMeta: metaV1.ObjectMeta{ + Namespace: app.Namespace, + Name: getTaskManagerName(app, hash), + }, + } +} + +// Translates a FlinkApplication into a TaskManager deployment. Changes to this function must be +// made very carefully. Any new version v' that causes DeploymentsEqual(v(x), v'(x)) to be false +// will cause redeployments for all applications, and should be considered a breaking change that +// requires a new version of the CRD. 
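+//
+// As a purely illustrative sketch of the kind of change this warns about (the label key below is hypothetical and not
+// part of this change), something as small as
+//
+//     labels["flink-operator-build"] = buildVersion // a new "default" label added to the template
+//
+// alters the deployment this function generates for every existing application, so DeploymentsEqual(old, new) and
+// therefore TaskManagerDeploymentMatches would report a mismatch for clusters that are already running, and the
+// operator would redeploy all of them.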
+func taskmanagerTemplate(app *v1alpha1.FlinkApplication) *v1.Deployment { + labels := getCommonAppLabels(app) + labels = common.CopyMap(labels, app.Labels) + labels[FlinkDeploymentType] = FlinkDeploymentTypeTaskmanager + + podSelector := &metaV1.LabelSelector{ + MatchLabels: labels, + } + + taskContainer := FetchTaskManagerContainerObj(app) + + replicas := computeTaskManagerReplicas(app) + return &v1.Deployment{ + TypeMeta: metaV1.TypeMeta{ + APIVersion: v1.SchemeGroupVersion.String(), + Kind: k8.Deployment, + }, + ObjectMeta: metaV1.ObjectMeta{ + Namespace: app.Namespace, + Labels: labels, + Annotations: getCommonAnnotations(app), + OwnerReferences: []metaV1.OwnerReference{ + *metaV1.NewControllerRef(app, app.GroupVersionKind()), + }, + }, + Spec: v1.DeploymentSpec{ + Selector: podSelector, + Strategy: v1.DeploymentStrategy{ + Type: v1.RecreateDeploymentStrategyType, + }, + Replicas: &replicas, + Template: coreV1.PodTemplateSpec{ + ObjectMeta: metaV1.ObjectMeta{ + Namespace: app.Namespace, + Labels: labels, + Annotations: app.Annotations, + }, + Spec: coreV1.PodSpec{ + Containers: []coreV1.Container{ + *taskContainer, + }, + Volumes: app.Spec.Volumes, + ImagePullSecrets: app.Spec.ImagePullSecrets, + }, + }, + }, + } +} + +func FetchTaskMangerDeploymentCreateObj(app *v1alpha1.FlinkApplication, hash string) *v1.Deployment { + template := taskmanagerTemplate(app.DeepCopy()) + + template.Name = getTaskManagerName(app, hash) + template.Labels[FlinkAppHash] = hash + template.Spec.Template.Labels[FlinkAppHash] = hash + template.Spec.Selector.MatchLabels[FlinkAppHash] = hash + template.Spec.Template.Name = getTaskManagerPodName(app, hash) + + InjectHashesIntoConfig(template, app, hash) + + return template +} + +func TaskManagerDeploymentMatches(deployment *v1.Deployment, application *v1alpha1.FlinkApplication) bool { + deploymentFromApp := FetchTaskMangerDeploymentCreateObj(application, HashForApplication(application)) + return DeploymentsEqual(deploymentFromApp, deployment) +} diff --git a/pkg/controller/flink/task_manager_controller_test.go b/pkg/controller/flink/task_manager_controller_test.go new file mode 100644 index 00000000..daa76683 --- /dev/null +++ b/pkg/controller/flink/task_manager_controller_test.go @@ -0,0 +1,143 @@ +package flink + +import ( + "testing" + + k8mock "github.com/lyft/flinkk8soperator/pkg/controller/k8/mock" + mockScope "github.com/lyft/flytestdlib/promutils" + + "context" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/common" + "github.com/lyft/flytestdlib/promutils/labeled" + "github.com/pkg/errors" + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/apps/v1" + k8sErrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +func getTMControllerForTest() TaskManagerController { + testScope := mockScope.NewTestScope() + labeled.SetMetricKeys(common.GetValidLabelNames()...) 
+ + return TaskManagerController{ + metrics: newTaskManagerMetrics(testScope), + k8Cluster: &k8mock.K8Cluster{}, + } +} + +func TestComputeTaskManagerReplicas(t *testing.T) { + app := v1alpha1.FlinkApplication{} + taskSlots := int32(4) + app.Spec.TaskManagerConfig.TaskSlots = &taskSlots + app.Spec.Parallelism = 9 + app.Spec.FlinkVersion = "1.7" + + assert.Equal(t, int32(3), computeTaskManagerReplicas(&app)) +} + +func TestGetTaskManagerName(t *testing.T) { + app := getFlinkTestApp() + assert.Equal(t, "app-name-"+testAppHash+"-tm", getTaskManagerName(&app, testAppHash)) +} + +func TestGetTaskManagerPodName(t *testing.T) { + app := getFlinkTestApp() + assert.Equal(t, "app-name-"+testAppHash+"-tm-pod", getTaskManagerPodName(&app, testAppHash)) +} + +func TestGetTaskManagerDeployment(t *testing.T) { + app := getFlinkTestApp() + deployment := v1.Deployment{} + deployment.Name = getTaskManagerName(&app, testAppHash) + deployments := []v1.Deployment{ + deployment, + } + assert.Equal(t, deployment, *getTaskManagerDeployment(deployments, &app)) +} + +func TestGetTaskManagerReplicaCount(t *testing.T) { + app := getFlinkTestApp() + deployment := v1.Deployment{} + deployment.Name = getTaskManagerName(&app, testAppHash) + replicaCount := int32(2) + deployment.Spec.Replicas = &replicaCount + deployments := []v1.Deployment{ + deployment, + } + assert.Equal(t, int32(2), getTaskManagerCount(deployments, &app)) +} + +func TestTaskManagerCreateSuccess(t *testing.T) { + testController := getTMControllerForTest() + app := getFlinkTestApp() + app.Spec.JarName = "test.jar" + app.Spec.EntryClass = "com.test.MainClass" + app.Spec.ProgramArgs = "--test" + annotations := map[string]string{ + "key": "annotation", + "flink-job-properties": "jarName: test.jar\nparallelism: 8\nentryClass:com.test.MainClass\nprogramArgs:\"--test\"", + } + + hash := "922eff1b" + + app.Annotations = annotations + expectedLabels := map[string]string{ + "flink-app": "app-name", + "flink-app-hash": hash, + "flink-deployment-type": "taskmanager", + } + mockK8Cluster := testController.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.CreateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + deployment := object.(*v1.Deployment) + assert.Equal(t, getTaskManagerName(&app, hash), deployment.Name) + assert.Equal(t, app.Namespace, deployment.Namespace) + assert.Equal(t, getTaskManagerPodName(&app, hash), deployment.Spec.Template.Name) + assert.Equal(t, annotations, deployment.Annotations) + assert.Equal(t, annotations, deployment.Spec.Template.Annotations) + assert.Equal(t, app.Namespace, deployment.Spec.Template.Namespace) + assert.Equal(t, expectedLabels, deployment.Labels) + + assert.Equal(t, "blob.server.port: 6125\njobmanager.heap.size: 1536\n"+ + "jobmanager.rpc.port: 6123\n"+ + "jobmanager.web.port: 8081\nmetrics.internal.query-service.port: 50101\n"+ + "query.server.port: 6124\ntaskmanager.heap.size: 512\n"+ + "taskmanager.numberOfTaskSlots: 16\n\n"+ + "high-availability.cluster-id: app-name-"+hash+"\n"+ + "jobmanager.rpc.address: app-name-"+hash+"\n", + common.GetEnvVar(deployment.Spec.Template.Spec.Containers[0].Env, + "OPERATOR_FLINK_CONFIG").Value) + + return nil + } + newlyCreated, err := testController.CreateIfNotExist(context.Background(), &app) + assert.Nil(t, err) + assert.True(t, newlyCreated) +} + +func TestTaskManagerCreateErr(t *testing.T) { + testController := getTMControllerForTest() + app := getFlinkTestApp() + mockK8Cluster := testController.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.CreateK8ObjectFunc = 
func(ctx context.Context, object runtime.Object) error { + return errors.New("create error") + } + newlyCreated, err := testController.CreateIfNotExist(context.Background(), &app) + assert.EqualError(t, err, "create error") + assert.False(t, newlyCreated) +} + +func TestTaskManagerCreateAlreadyExists(t *testing.T) { + testController := getTMControllerForTest() + app := getFlinkTestApp() + mockK8Cluster := testController.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.CreateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + return k8sErrors.NewAlreadyExists(schema.GroupResource{}, "") + } + newlyCreated, err := testController.CreateIfNotExist(context.Background(), &app) + assert.Nil(t, err) + assert.False(t, newlyCreated) +} diff --git a/pkg/controller/flinkapplication/controller.go b/pkg/controller/flinkapplication/controller.go new file mode 100644 index 00000000..30c59252 --- /dev/null +++ b/pkg/controller/flinkapplication/controller.go @@ -0,0 +1,182 @@ +package flinkapplication + +import ( + "context" + + "github.com/lyft/flytestdlib/promutils" + "github.com/lyft/flytestdlib/promutils/labeled" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/config" + "sigs.k8s.io/controller-runtime/pkg/controller" + + "time" + + "github.com/lyft/flinkk8soperator/pkg/controller/k8" + "github.com/lyft/flytestdlib/contextutils" + "github.com/lyft/flytestdlib/logger" + v1 "k8s.io/api/apps/v1" + coreV1 "k8s.io/api/core/v1" + metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" +) + +// ReconcileFlinkApplication reconciles a FlinkApplication resource +type ReconcileFlinkApplication struct { + client client.Client + cache cache.Cache + metrics *reconcilerMetrics + flinkStateMachine FlinkHandlerInterface +} + +type reconcilerMetrics struct { + scope promutils.Scope + cacheHit labeled.Counter + cacheMiss labeled.Counter +} + +func newReconcilerMetrics(scope promutils.Scope) *reconcilerMetrics { + reconcilerScope := scope.NewSubScope("reconciler") + return &reconcilerMetrics{ + scope: reconcilerScope, + cacheHit: labeled.NewCounter("cache_hit", "Flink application resource fetched from cache", reconcilerScope), + cacheMiss: labeled.NewCounter("cache_miss", "Flink application resource missing from cache", reconcilerScope), + } +} + +func (r *ReconcileFlinkApplication) getResource(ctx context.Context, key types.NamespacedName, obj runtime.Object) error { + err := r.cache.Get(ctx, key, obj) + if err != nil && k8.IsK8sObjectDoesNotExist(err) { + r.metrics.cacheMiss.Inc(ctx) + return r.client.Get(ctx, key, obj) + } + if err == nil { + r.metrics.cacheHit.Inc(ctx) + } + return err +} + +// For failures, we do not want to retry immediately, as we want the underlying resource to recover. +// At the same time, we want to retry faster than the regular success interval. 
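+//
+// A rough sketch of the resulting timing, assuming (purely for illustration) a configured resync period of 30s:
+//
+//     successful reconcile -> picked up again on the normal resync, roughly 30s later
+//     failed reconcile     -> requeued after ResyncPeriod / 2, i.e. roughly 15s (see getFailureRetryInterval below)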
+func (r *ReconcileFlinkApplication) getFailureRetryInterval() time.Duration { + return config.GetConfig().ResyncPeriod.Duration / 2 +} + +func (r *ReconcileFlinkApplication) getReconcileResultForError(err error) reconcile.Result { + if err == nil { + return reconcile.Result{} + } + return reconcile.Result{ + RequeueAfter: r.getFailureRetryInterval(), + } +} + +func (r *ReconcileFlinkApplication) Reconcile(request reconcile.Request) (reconcile.Result, error) { + ctx := context.Background() + ctx = contextutils.WithNamespace(ctx, request.Namespace) + ctx = contextutils.WithAppName(ctx, request.Name) + typeMeta := metaV1.TypeMeta{ + Kind: v1alpha1.FlinkApplicationKind, + APIVersion: v1alpha1.SchemeGroupVersion.String(), + } + // Fetch the FlinkApplication instance + instance := &v1alpha1.FlinkApplication{ + TypeMeta: typeMeta, + } + + err := r.getResource(ctx, request.NamespacedName, instance) + if err != nil { + if k8.IsK8sObjectDoesNotExist(err) { + // Request object not found, could have been deleted after reconcile request. + // Return and don't requeue + return reconcile.Result{}, nil + } + // Error reading the object - we will check again in next loop + return r.getReconcileResultForError(err), nil + } + // We are seeing instances where getResource is removing TypeMeta + instance.TypeMeta = typeMeta + ctx = contextutils.WithPhase(ctx, string(instance.Status.Phase)) + err = r.flinkStateMachine.Handle(ctx, instance) + if err != nil { + logger.Warnf(ctx, "Failed to reconcile resource %v: %v", request.NamespacedName, err) + } + return r.getReconcileResultForError(err), err +} + +// Add creates a new FlinkApplication Controller and adds it to the Manager. The Manager will set fields on the Controller +// and Start it when the Manager is Started. +func Add(ctx context.Context, mgr manager.Manager, cfg config.RuntimeConfig) error { + k8sCluster := k8.NewK8Cluster(mgr) + flinkStateMachine := NewFlinkStateMachine(k8sCluster, cfg) + + metrics := newReconcilerMetrics(cfg.MetricsScope) + reconciler := ReconcileFlinkApplication{ + client: mgr.GetClient(), + cache: mgr.GetCache(), + metrics: metrics, + flinkStateMachine: flinkStateMachine, + } + + c, err := controller.New("flinkAppController", mgr, controller.Options{ + MaxConcurrentReconciles: config.GetConfig().Workers, + Reconciler: &reconciler, + }) + + if err != nil { + return err + } + + if err = c.Watch(&source.Kind{Type: &v1alpha1.FlinkApplication{}}, &handler.EnqueueRequestForObject{}); err != nil { + return err + } + + // Watch deployments and services for the application + if err := c.Watch(&source.Kind{Type: &v1.Deployment{}}, &handler.Funcs{}, getPredicateFuncs()); err != nil { + return err + } + + if err := c.Watch(&source.Kind{Type: &coreV1.Service{}}, &handler.Funcs{}, getPredicateFuncs()); err != nil { + return err + } + return nil +} + +func isOwnedByFlinkApplication(ownerReferences []metaV1.OwnerReference) bool { + for _, ownerReference := range ownerReferences { + if ownerReference.APIVersion == v1alpha1.SchemeGroupVersion.String() && + ownerReference.Kind == v1alpha1.FlinkApplicationKind { + return true + } + } + return false +} + +// Predicate filters events before enqueuing the keys. 
+// We are only interested in Kubernetes objects that are owned by the FlinkApplication. +// This filters out all the objects not owned by the FlinkApplication, and ensures that only a subset reaches the event handlers. +func getPredicateFuncs() predicate.Funcs { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return isOwnedByFlinkApplication(e.Meta.GetOwnerReferences()) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + return isOwnedByFlinkApplication(e.MetaNew.GetOwnerReferences()) + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return isOwnedByFlinkApplication(e.Meta.GetOwnerReferences()) + }, + GenericFunc: func(e event.GenericEvent) bool { + return isOwnedByFlinkApplication(e.Meta.GetOwnerReferences()) + }, + } +} diff --git a/pkg/controller/flinkapplication/flink_state_machine.go b/pkg/controller/flinkapplication/flink_state_machine.go new file mode 100644 index 00000000..a1ed1e45 --- /dev/null +++ b/pkg/controller/flinkapplication/flink_state_machine.go @@ -0,0 +1,593 @@ +package flinkapplication + +import ( + "context" + "time" + + "github.com/pkg/errors" + + "fmt" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/config" + "github.com/lyft/flinkk8soperator/pkg/controller/flink" + "github.com/lyft/flinkk8soperator/pkg/controller/flink/client" + "github.com/lyft/flinkk8soperator/pkg/controller/k8" + "github.com/lyft/flytestdlib/logger" + "github.com/lyft/flytestdlib/promutils" + "github.com/lyft/flytestdlib/promutils/labeled" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/clock" +) + +const ( + jobFinalizer = "job.finalizers.flink.k8s.io" +) + +// The core state machine that manages Flink clusters and jobs. See docs/state_machine.md for a description of the +// states and transitions. 
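+//
+// As a quick orientation (derived from handle() below; docs/state_machine.md remains the authoritative description),
+// a deploy normally moves through:
+//
+//     New/Updating -> ClusterStarting -> Savepointing -> SubmittingJob -> Running
+//
+// On the very first deploy (no DeployHash yet) Savepointing falls straight through to SubmittingJob. A phase that
+// fails to make progress within the configured staleness duration moves to RollingBackJob or DeployFailed, and a
+// deletion timestamp on the resource moves the application to Deleting.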
+type FlinkHandlerInterface interface { + Handle(ctx context.Context, application *v1alpha1.FlinkApplication) error +} + +type FlinkStateMachine struct { + flinkController flink.ControllerInterface + k8Cluster k8.ClusterInterface + clock clock.Clock + metrics *stateMachineMetrics +} + +type stateMachineMetrics struct { + scope promutils.Scope + stateMachineHandlePhaseMap map[v1alpha1.FlinkApplicationPhase]labeled.StopWatch + stateMachineHandleSuccessPhaseMap map[v1alpha1.FlinkApplicationPhase]labeled.StopWatch + errorCounterPhaseMap map[v1alpha1.FlinkApplicationPhase]labeled.Counter +} + +func newStateMachineMetrics(scope promutils.Scope) *stateMachineMetrics { + stateMachineScope := scope.NewSubScope("state_machine") + stateMachineHandlePhaseMap := map[v1alpha1.FlinkApplicationPhase]labeled.StopWatch{} + stateMachineHandleSuccessPhaseMap := map[v1alpha1.FlinkApplicationPhase]labeled.StopWatch{} + errorCounterPhaseMap := map[v1alpha1.FlinkApplicationPhase]labeled.Counter{} + + for _, phase := range v1alpha1.FlinkApplicationPhases { + phaseName := phase.VerboseString() + stateMachineHandleSuccessPhaseMap[phase] = labeled.NewStopWatch(phaseName+"_"+"handle_time_success", + fmt.Sprintf("Total time to handle the %s application state on success", phaseName), time.Millisecond, stateMachineScope) + stateMachineHandlePhaseMap[phase] = labeled.NewStopWatch(phaseName+"_"+"handle_time", + fmt.Sprintf("Total time to handle the %s application state", phaseName), time.Millisecond, stateMachineScope) + errorCounterPhaseMap[phase] = labeled.NewCounter(phaseName+"_"+"error", + fmt.Sprintf("Failure to handle the %s application state", phaseName), stateMachineScope) + } + return &stateMachineMetrics{ + scope: scope, + stateMachineHandlePhaseMap: stateMachineHandlePhaseMap, + stateMachineHandleSuccessPhaseMap: stateMachineHandleSuccessPhaseMap, + errorCounterPhaseMap: errorCounterPhaseMap, + } +} + +func (s *FlinkStateMachine) updateApplicationPhase(ctx context.Context, application *v1alpha1.FlinkApplication, phase v1alpha1.FlinkApplicationPhase) error { + application.Status.Phase = phase + now := v1.NewTime(s.clock.Now()) + application.Status.LastUpdatedAt = &now + + return s.k8Cluster.UpdateK8Object(ctx, application) +} + +func (s *FlinkStateMachine) shouldRollback(ctx context.Context, application *v1alpha1.FlinkApplication) bool { + if application.Status.DeployHash == "" { + // TODO: we may want some more sophisticated way of handling this case + // there's no previous deploy for this application, so nothing to roll back to + return false + } + + appLastUpdated := application.Status.LastUpdatedAt + if appLastUpdated != nil && !v1alpha1.IsRunningPhase(application.Status.Phase) { + elapsedTime := s.clock.Since(appLastUpdated.Time) + if s.getStalenessDuration() > 0 && elapsedTime > s.getStalenessDuration() { + s.flinkController.LogEvent(ctx, application, "", corev1.EventTypeWarning, fmt.Sprintf("Application failed to progress for %v in the %v phase", + elapsedTime, application.Status.Phase)) + + return true + } + } + return false +} + +func (s *FlinkStateMachine) Handle(ctx context.Context, application *v1alpha1.FlinkApplication) error { + currentPhase := application.Status.Phase + timer := s.metrics.stateMachineHandlePhaseMap[currentPhase].Start(ctx) + successTimer := s.metrics.stateMachineHandleSuccessPhaseMap[currentPhase].Start(ctx) + + defer timer.Stop() + err := s.handle(ctx, application) + if err != nil { + s.metrics.errorCounterPhaseMap[currentPhase].Inc(ctx) + } else { + successTimer.Stop() + } + 
return err +} + +func (s *FlinkStateMachine) handle(ctx context.Context, application *v1alpha1.FlinkApplication) error { + if !application.ObjectMeta.DeletionTimestamp.IsZero() && application.Status.Phase != v1alpha1.FlinkApplicationDeleting { + // Always perform a single application update per callback + return s.updateApplicationPhase(ctx, application, v1alpha1.FlinkApplicationDeleting) + } + + if !v1alpha1.IsRunningPhase(application.Status.Phase) { + logger.Infof(ctx, "Handling state %s for application", application.Status.Phase) + } + + switch application.Status.Phase { + case v1alpha1.FlinkApplicationNew, v1alpha1.FlinkApplicationUpdating: + // Currently just transitions to the next state + return s.handleNewOrUpdating(ctx, application) + case v1alpha1.FlinkApplicationClusterStarting: + return s.handleClusterStarting(ctx, application) + case v1alpha1.FlinkApplicationSubmittingJob: + return s.handleSubmittingJob(ctx, application) + case v1alpha1.FlinkApplicationRunning, v1alpha1.FlinkApplicationDeployFailed: + return s.handleApplicationRunning(ctx, application) + case v1alpha1.FlinkApplicationSavepointing: + return s.handleApplicationSavepointing(ctx, application) + case v1alpha1.FlinkApplicationRollingBackJob: + return s.handleRollingBack(ctx, application) + case v1alpha1.FlinkApplicationDeleting: + return s.handleApplicationDeleting(ctx, application) + } + return nil +} + +// In this state we create a new cluster, either due to an entirely new FlinkApplication or due to an update. +func (s *FlinkStateMachine) handleNewOrUpdating(ctx context.Context, application *v1alpha1.FlinkApplication) error { + // TODO: add up-front validation on the FlinkApplication resource + if s.shouldRollback(ctx, application) { + // we've failed to make progress; move to deploy failed + return s.deployFailed(ctx, application) + } + + // Create the Flink cluster + err := s.flinkController.CreateCluster(ctx, application) + if err != nil { + logger.Errorf(ctx, "Cluster creation failed with error: %v", err) + return err + } + + return s.updateApplicationPhase(ctx, application, v1alpha1.FlinkApplicationClusterStarting) +} + +func (s *FlinkStateMachine) deployFailed(ctx context.Context, app *v1alpha1.FlinkApplication) error { + s.flinkController.LogEvent(ctx, app, "", corev1.EventTypeWarning, "Deployment failed, rolled back successfully") + app.Status.FailedDeployHash = flink.HashForApplication(app) + + return s.updateApplicationPhase(ctx, app, v1alpha1.FlinkApplicationDeployFailed) +} + +// Create the underlying Kubernetes objects for the new cluster +func (s *FlinkStateMachine) handleClusterStarting(ctx context.Context, application *v1alpha1.FlinkApplication) error { + if s.shouldRollback(ctx, application) { + // we've failed to make progress; move to deploy failed + // TODO: this will need different logic in single mode + return s.deployFailed(ctx, application) + } + + // Wait for all to be running + ready, err := s.flinkController.IsClusterReady(ctx, application) + if err != nil { + return err + } + if !ready { + return nil + } + + logger.Infof(ctx, "Flink cluster has started successfully") + // TODO: in single mode move to submitting job + return s.updateApplicationPhase(ctx, application, v1alpha1.FlinkApplicationSavepointing) +} + +func (s *FlinkStateMachine) handleApplicationSavepointing(ctx context.Context, application *v1alpha1.FlinkApplication) error { + // we've already savepointed (or this is our first deploy), continue on + if application.Spec.SavepointInfo.SavepointLocation != "" || 
application.Status.DeployHash == "" { + return s.updateApplicationPhase(ctx, application, v1alpha1.FlinkApplicationSubmittingJob) + } + + // we haven't started savepointing yet; do so now + // TODO: figure out the idempotence of this + if application.Spec.SavepointInfo.TriggerID == "" { + if s.shouldRollback(ctx, application) { + // we were unable to start savepointing for our failure period, so roll back + // TODO: we should think about how to handle the case where the cluster has started savepointing, but does + // not finish within some time frame. Currently, we just wait indefinitely for the JM to report its + // status. It's not clear what the right answer is. + return s.deployFailed(ctx, application) + } + + triggerID, err := s.flinkController.CancelWithSavepoint(ctx, application, application.Status.DeployHash) + if err != nil { + return err + } + + s.flinkController.LogEvent(ctx, application, "", corev1.EventTypeNormal, fmt.Sprintf("Cancelling job %s with a final savepoint", application.Status.JobStatus.JobID)) + + application.Spec.SavepointInfo.TriggerID = triggerID + return s.k8Cluster.UpdateK8Object(ctx, application) + } + + // check the savepoints in progress + savepointStatusResponse, err := s.flinkController.GetSavepointStatus(ctx, application, application.Status.DeployHash) + if err != nil { + return err + } + + var restorePath string + if savepointStatusResponse.Operation.Location == "" && + savepointStatusResponse.SavepointStatus.Status != client.SavePointInProgress { + // Savepointing failed + // TODO: we should probably retry this a few times before failing + s.flinkController.LogEvent(ctx, application, "", corev1.EventTypeWarning, fmt.Sprintf("Failed to take savepoint: %v", + savepointStatusResponse.Operation.FailureCause)) + + // try to find an externalized checkpoint + path, err := s.flinkController.FindExternalizedCheckpoint(ctx, application, application.Status.DeployHash) + if err != nil { + logger.Infof(ctx, "error while fetching externalized checkpoint path: %v", err) + return s.deployFailed(ctx, application) + } else if path == "" { + logger.Infof(ctx, "no externalized checkpoint found") + return s.deployFailed(ctx, application) + } + + s.flinkController.LogEvent(ctx, application, "", corev1.EventTypeNormal, fmt.Sprintf("Restoring from externalized checkpoint %s", path)) + + restorePath = path + } else if savepointStatusResponse.SavepointStatus.Status == client.SavePointCompleted { + s.flinkController.LogEvent(ctx, application, "", corev1.EventTypeNormal, fmt.Sprintf("Canceled job with savepoint %s", + savepointStatusResponse.Operation.Location)) + restorePath = savepointStatusResponse.Operation.Location + } + + if restorePath != "" { + application.Spec.SavepointInfo.SavepointLocation = restorePath + return s.updateApplicationPhase(ctx, application, v1alpha1.FlinkApplicationSubmittingJob) + } + + return nil +} + +func (s *FlinkStateMachine) submitJobIfNeeded(ctx context.Context, app *v1alpha1.FlinkApplication, hash string, + jarName string, parallelism int32, entryClass string, programArgs string) (*client.FlinkJob, error) { + isReady, _ := s.flinkController.IsServiceReady(ctx, app, hash) + // Ignore errors + if !isReady { + return nil, nil + } + + // add the job running finalizer if necessary + if err := s.addFinalizerIfMissing(ctx, app, jobFinalizer); err != nil { + return nil, err + } + + // Check that there are no jobs running before starting the job + jobs, err := s.flinkController.GetJobsForApplication(ctx, app, hash) + if err != nil { + return nil, err + } 
+ + // TODO: check if there are multiple active jobs + activeJob := flink.GetActiveFlinkJob(jobs) + if activeJob == nil { + logger.Infof(ctx, "No active job found for the application %v", jobs) + jobID, err := s.flinkController.StartFlinkJob(ctx, app, hash, + jarName, parallelism, entryClass, programArgs) + if err != nil { + s.flinkController.LogEvent(ctx, app, "", corev1.EventTypeWarning, fmt.Sprintf("Failed to submit job to cluster: %v", err)) + + // TODO: we probably want some kind of back-off here + return nil, err + } + + s.flinkController.LogEvent(ctx, app, "", corev1.EventTypeNormal, fmt.Sprintf("Flink job submitted to cluster with id %s", jobID)) + app.Status.JobStatus.JobID = jobID + activeJob = flink.GetActiveFlinkJob(jobs) + } else { + app.Status.JobStatus.JobID = activeJob.JobID + } + + return activeJob, nil +} + +func (s *FlinkStateMachine) updateGenericService(ctx context.Context, app *v1alpha1.FlinkApplication, newHash string) error { + service, err := s.k8Cluster.GetService(ctx, app.Namespace, app.Name) + if err != nil { + return err + } + if service == nil { + // this is bad... if the service is somehow deleted between the previous call to CreateCluster and here + // recovery will not be possible + // TODO: handle this case better + return errors.New("service does not exist") + } + + if service.Spec.Selector[flink.FlinkAppHash] != newHash { + // the service hasn't yet been updated + service.Spec.Selector[flink.FlinkAppHash] = newHash + err = s.k8Cluster.UpdateK8Object(ctx, service) + if err != nil { + return err + } + } + + return nil +} + +func (s *FlinkStateMachine) handleSubmittingJob(ctx context.Context, app *v1alpha1.FlinkApplication) error { + if s.shouldRollback(ctx, app) { + // Something's gone wrong; roll back + return s.updateApplicationPhase(ctx, app, v1alpha1.FlinkApplicationRollingBackJob) + } + + // switch the service to point to the new jobmanager + hash := flink.HashForApplication(app) + err := s.updateGenericService(ctx, app, hash) + if err != nil { + return err + } + + activeJob, err := s.submitJobIfNeeded(ctx, app, hash, + app.Spec.JarName, app.Spec.Parallelism, app.Spec.EntryClass, app.Spec.ProgramArgs) + if err != nil { + return err + } + + if activeJob != nil && activeJob.Status == client.Running { + // Clear the savepoint info + app.Spec.SavepointInfo = v1alpha1.SavepointInfo{} + // Update the application status with the running job info + app.Status.DeployHash = hash + app.Status.JobStatus.JarName = app.Spec.JarName + app.Status.JobStatus.Parallelism = app.Spec.Parallelism + app.Status.JobStatus.EntryClass = app.Spec.EntryClass + app.Status.JobStatus.ProgramArgs = app.Spec.ProgramArgs + + return s.updateApplicationPhase(ctx, app, v1alpha1.FlinkApplicationRunning) + } + + return nil +} + +// Something has gone wrong during the update, post job-cancellation (and cluster tear-down in single mode). We need +// to try to get things back into a working state +func (s *FlinkStateMachine) handleRollingBack(ctx context.Context, app *v1alpha1.FlinkApplication) error { + if s.shouldRollback(ctx, app) { + // we've failed in our roll back attempt (presumably because something's now wrong with the original cluster) + // move immediately to the DeployFailed state so that the user can recover. 
+ return s.deployFailed(ctx, app) + } + + s.flinkController.LogEvent(ctx, app, "", corev1.EventTypeWarning, "Deployment failed, rolling back") + + // TODO: handle single mode + + // TODO: it's possible that a job is successfully running in the new cluster at this point -- should cancel it + // so that we never have two jobs running at once. + + // update the service to point back to the old deployment if needed + err := s.updateGenericService(ctx, app, app.Status.DeployHash) + if err != nil { + return err + } + + // wait until the service is ready + isReady, _ := s.flinkController.IsServiceReady(ctx, app, app.Status.DeployHash) + // Ignore errors + if !isReady { + return nil + } + + // submit the old job + activeJob, err := s.submitJobIfNeeded(ctx, app, app.Status.DeployHash, + app.Status.JobStatus.JarName, app.Status.JobStatus.Parallelism, + app.Status.JobStatus.EntryClass, app.Status.JobStatus.ProgramArgs) + + if err != nil { + return err + } + + if activeJob != nil { + app.Spec.SavepointInfo = v1alpha1.SavepointInfo{} + // move to the deploy failed state + return s.deployFailed(ctx, app) + } + + return nil +} + +// Check if the application is Running. +// This is a stable state. Keep monitoring if the underlying CRD reflects the Flink cluster +func (s *FlinkStateMachine) handleApplicationRunning(ctx context.Context, application *v1alpha1.FlinkApplication) error { + jobs, err := s.flinkController.GetJobsForApplication(ctx, application, application.Status.DeployHash) + if err != nil { + // TODO: think more about this case + return err + } + + // The jobid in Flink can change if there is a Job manager failover. + // The Operator needs to update its state with the right value. + // In the Running state, there must be a job already started in the cluster. + activeJob := flink.GetActiveFlinkJob(jobs) + if activeJob != nil { + application.Status.JobStatus.JobID = activeJob.JobID + } + + logger.Debugf(ctx, "Application running with job %v", activeJob) + + cur, old, err := s.flinkController.GetCurrentAndOldDeploymentsForApp(ctx, application) + if err != nil { + return err + } + + // If the application has changed (i.e., there are no current deployments), and we haven't already failed trying to + // do the update, move to the cluster starting phase to create the new cluster + if cur == nil { + logger.Infof(ctx, "Application resource has changed. 
Moving to Updating") + // TODO: handle single mode + return s.updateApplicationPhase(ctx, application, v1alpha1.FlinkApplicationUpdating) + } + + // If there are old deployments left-over from a previous version, clean them up + for _, fd := range old { + s.flinkController.LogEvent(ctx, application, "", corev1.EventTypeNormal, fmt.Sprintf("Deleting old cluster with hash %s", fd.Hash)) + err := s.flinkController.DeleteCluster(ctx, application, fd.Hash) + if err != nil { + return err + } + } + + // Update status of the cluster + hasClusterStatusChanged, clusterErr := s.flinkController.CompareAndUpdateClusterStatus(ctx, application, application.Status.DeployHash) + if clusterErr != nil { + logger.Errorf(ctx, "Updating cluster status failed with %v", clusterErr) + } + + // Update status of jobs on the cluster + hasJobStatusChanged, jobsErr := s.flinkController.CompareAndUpdateJobStatus(ctx, application, application.Status.DeployHash) + if jobsErr != nil { + logger.Errorf(ctx, "Updating jobs status failed with %v", jobsErr) + } + + // Update k8s object if either job or cluster status has changed + if hasJobStatusChanged || hasClusterStatusChanged { + return s.k8Cluster.UpdateK8Object(ctx, application) + } + + return nil +} + +func (s *FlinkStateMachine) getStalenessDuration() time.Duration { + return config.GetConfig().StatemachineStalenessDuration.Duration +} + +func (s *FlinkStateMachine) addFinalizerIfMissing(ctx context.Context, application *v1alpha1.FlinkApplication, finalizer string) error { + for _, f := range application.Finalizers { + if f == finalizer { + return nil + } + } + + // finalizer not present; add + application.Finalizers = append(application.Finalizers, finalizer) + return s.k8Cluster.UpdateK8Object(ctx, application) +} + +func removeString(list []string, target string) []string { + ret := make([]string, 0) + for _, s := range list { + if s != target { + ret = append(ret, s) + } + } + + return ret +} + +func (s *FlinkStateMachine) clearFinalizers(ctx context.Context, app *v1alpha1.FlinkApplication) error { + app.Finalizers = removeString(app.Finalizers, jobFinalizer) + return s.k8Cluster.UpdateK8Object(ctx, app) +} + +func jobFinished(jobs []client.FlinkJob, id string) bool { + for _, job := range jobs { + if job.JobID == id { + return job.Status == client.Canceled || + job.Status == client.Failed || + job.Status == client.Finished + } + } + + return true +} + +func (s *FlinkStateMachine) handleApplicationDeleting(ctx context.Context, app *v1alpha1.FlinkApplication) error { + // There should be a way for the user to force deletion (e.g., if the job is failing and they can't + // savepoint). However, this seems dangerous to do automatically. + // If https://github.com/kubernetes/kubernetes/issues/56567 is fixed users will be able to use + // kubectl delete --force, but for now they will need to update the DeleteMode. 
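+//
+// Summary of the delete modes handled below:
+//   - DeleteModeNone: remove the job finalizer immediately and let the cluster be torn down without touching the job.
+//   - DeleteModeForceCancel: force-cancel the job, then remove the finalizer once the job has stopped.
+//   - DeleteModeSavepoint (the default): cancel the job with a savepoint, wait for the savepoint to complete
+//     (clearing the trigger ID to retry if savepointing fails), then remove the finalizer.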
+ + if app.Spec.DeleteMode == v1alpha1.DeleteModeNone { + // just delete the finalizer so the cluster can be torn down + return s.clearFinalizers(ctx, app) + } + + jobs, err := s.flinkController.GetJobsForApplication(ctx, app, app.Status.DeployHash) + if err != nil { + return err + } + + finished := jobFinished(jobs, app.Status.JobStatus.JobID) + switch app.Spec.DeleteMode { + case v1alpha1.DeleteModeForceCancel: + if finished { + // the job has already been cancelled, so clear the finalizer + return s.clearFinalizers(ctx, app) + } + + logger.Infof(ctx, "Force cancelling job as part of cleanup") + return s.flinkController.ForceCancel(ctx, app, app.Status.DeployHash) + case v1alpha1.DeleteModeSavepoint, "": + if app.Spec.SavepointInfo.SavepointLocation != "" { + if finished { + return s.clearFinalizers(ctx, app) + } + // we've already created the savepoint, now just waiting for the job to be cancelled + return nil + } + + if app.Spec.SavepointInfo.TriggerID == "" { + // delete with savepoint + triggerID, err := s.flinkController.CancelWithSavepoint(ctx, app, app.Status.DeployHash) + if err != nil { + return err + } + s.flinkController.LogEvent(ctx, app, "", corev1.EventTypeNormal, fmt.Sprintf("Cancelling job with savepoint %v", triggerID)) + app.Spec.SavepointInfo.TriggerID = triggerID + } else { + // we've already started savepointing; check the status + status, err := s.flinkController.GetSavepointStatus(ctx, app, app.Status.DeployHash) + if err != nil { + return err + } + + if status.Operation.Location == "" && status.SavepointStatus.Status != client.SavePointInProgress { + // savepointing failed + s.flinkController.LogEvent(ctx, app, "", corev1.EventTypeWarning, fmt.Sprintf("Failed to take savepoint %v", status.Operation.FailureCause)) + // clear the trigger id so that we can try again + app.Spec.SavepointInfo.TriggerID = "" + } else if status.SavepointStatus.Status == client.SavePointCompleted { + // we're done, clean up + s.flinkController.LogEvent(ctx, app, "", corev1.EventTypeNormal, fmt.Sprintf("Cancelled job with savepoint '%s'", status.Operation.Location)) + app.Spec.SavepointInfo.SavepointLocation = status.Operation.Location + app.Spec.SavepointInfo.TriggerID = "" + } + } + + return s.k8Cluster.UpdateK8Object(ctx, app) + default: + logger.Errorf(ctx, "Unsupported DeleteMode %s", app.Spec.DeleteMode) + } + + return nil +} + +func NewFlinkStateMachine(k8sCluster k8.ClusterInterface, config config.RuntimeConfig) FlinkHandlerInterface { + + metrics := newStateMachineMetrics(config.MetricsScope) + return &FlinkStateMachine{ + k8Cluster: k8sCluster, + flinkController: flink.NewController(k8sCluster, config), + clock: clock.RealClock{}, + metrics: metrics, + } +} diff --git a/pkg/controller/flinkapplication/flink_state_machine_test.go b/pkg/controller/flinkapplication/flink_state_machine_test.go new file mode 100644 index 00000000..13d513c0 --- /dev/null +++ b/pkg/controller/flinkapplication/flink_state_machine_test.go @@ -0,0 +1,850 @@ +package flinkapplication + +import ( + "context" + "testing" + "time" + + controller_config "github.com/lyft/flinkk8soperator/pkg/controller/config" + "github.com/lyft/flinkk8soperator/pkg/controller/flink" + "github.com/lyft/flinkk8soperator/pkg/controller/flink/client" + "github.com/lyft/flytestdlib/config" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + "github.com/lyft/flinkk8soperator/pkg/controller/common" + 
"github.com/lyft/flinkk8soperator/pkg/controller/flink/mock" + k8mock "github.com/lyft/flinkk8soperator/pkg/controller/k8/mock" + mockScope "github.com/lyft/flytestdlib/promutils" + "github.com/lyft/flytestdlib/promutils/labeled" + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/clock" +) + +const testSavepointLocation = "location" + +func getTestStateMachine() FlinkStateMachine { + testScope := mockScope.NewTestScope() + labeled.SetMetricKeys(common.GetValidLabelNames()...) + + return FlinkStateMachine{ + flinkController: &mock.FlinkController{}, + k8Cluster: &k8mock.K8Cluster{}, + clock: &clock.FakeClock{}, + metrics: newStateMachineMetrics(testScope), + } +} + +func testFlinkDeployment(app *v1alpha1.FlinkApplication) common.FlinkDeployment { + hash := flink.HashForApplication(app) + return common.FlinkDeployment{ + Jobmanager: flink.FetchJobMangerDeploymentCreateObj(app, hash), + Taskmanager: flink.FetchTaskMangerDeploymentCreateObj(app, hash), + Hash: hash, + } +} + +func TestHandleNewOrCreate(t *testing.T) { + stateMachineForTest := getTestStateMachine() + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, v1alpha1.FlinkApplicationClusterStarting, application.Status.Phase) + return nil + } + + err := stateMachineForTest.Handle(context.Background(), &v1alpha1.FlinkApplication{ + Spec: v1alpha1.FlinkApplicationSpec{}, + }) + assert.Nil(t, err) +} + +func TestHandleStartingClusterStarting(t *testing.T) { + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.IsClusterReadyFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + return false, nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + assert.False(t, true) + return nil + } + err := stateMachineForTest.Handle(context.Background(), &v1alpha1.FlinkApplication{ + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationClusterStarting, + }, + }) + assert.Nil(t, err) +} + +func TestHandleStartingDual(t *testing.T) { + updateInvoked := false + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.IsClusterReadyFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (bool, error) { + return true, nil + } + + mockFlinkController.GetCurrentAndOldDeploymentsForAppFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (*common.FlinkDeployment, []common.FlinkDeployment, error) { + fd := testFlinkDeployment(application) + fd.Taskmanager.Status.AvailableReplicas = 2 + fd.Jobmanager.Status.AvailableReplicas = 1 + return &fd, nil, nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, v1alpha1.FlinkApplicationSavepointing, application.Status.Phase) + updateInvoked = true + return nil + } + err := stateMachineForTest.Handle(context.Background(), &v1alpha1.FlinkApplication{ + Status: v1alpha1.FlinkApplicationStatus{ + Phase: 
v1alpha1.FlinkApplicationClusterStarting, + }, + }) + assert.True(t, updateInvoked) + assert.Nil(t, err) +} + +func TestHandleApplicationSavepointingInitialDeploy(t *testing.T) { + // on the initial deploy we should skip savepointing and go straight to SubmittingJob + updateInvoked := false + stateMachineForTest := getTestStateMachine() + + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.CancelWithSavepointFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (s string, e error) { + // should not be called + assert.False(t, true) + return "", nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, v1alpha1.FlinkApplicationSubmittingJob, application.Status.Phase) + updateInvoked = true + return nil + } + + err := stateMachineForTest.Handle(context.Background(), &v1alpha1.FlinkApplication{ + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationSavepointing, + }, + }) + assert.True(t, updateInvoked) + assert.Nil(t, err) +} + +func TestHandleApplicationSavepointingDual(t *testing.T) { + app := v1alpha1.FlinkApplication{ + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationSavepointing, + DeployHash: "old-hash", + }, + } + + cancelInvoked := false + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + + mockFlinkController.CancelWithSavepointFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (s string, e error) { + assert.Equal(t, "old-hash", hash) + cancelInvoked = true + + return "trigger", nil + } + + mockFlinkController.GetSavepointStatusFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (*client.SavepointResponse, error) { + assert.Equal(t, "old-hash", hash) + return &client.SavepointResponse{ + SavepointStatus: client.SavepointStatusResponse{ + Status: client.SavePointCompleted, + }, + Operation: client.SavepointOperationResponse{ + Location: testSavepointLocation, + }, + }, nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + updateCount := 0 + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + if updateCount == 0 { + assert.Equal(t, "trigger", application.Spec.SavepointInfo.TriggerID) + } else { + assert.Equal(t, testSavepointLocation, application.Spec.SavepointInfo.SavepointLocation) + assert.Equal(t, v1alpha1.FlinkApplicationSubmittingJob, application.Status.Phase) + } + + updateCount++ + return nil + } + + err := stateMachineForTest.Handle(context.Background(), &app) + assert.Nil(t, err) + + err = stateMachineForTest.Handle(context.Background(), &app) + assert.Nil(t, err) + + assert.Equal(t, updateCount, 2) + assert.True(t, cancelInvoked) + assert.Nil(t, err) +} + +func TestHandleApplicationSavepointingFailed(t *testing.T) { + updateInvoked := false + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.GetSavepointStatusFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (*client.SavepointResponse, error) { + return &client.SavepointResponse{ + SavepointStatus: 
client.SavepointStatusResponse{ + Status: client.SavePointCompleted, + }, + }, nil + } + + app := v1alpha1.FlinkApplication{ + Spec: v1alpha1.FlinkApplicationSpec{ + SavepointInfo: v1alpha1.SavepointInfo{ + TriggerID: "trigger", + }, + }, + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationSavepointing, + DeployHash: "blah", + }, + } + hash := flink.HashForApplication(&app) + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + assert.Empty(t, application.Spec.SavepointInfo.SavepointLocation) + assert.Equal(t, hash, application.Status.FailedDeployHash) + assert.Equal(t, v1alpha1.FlinkApplicationDeployFailed, application.Status.Phase) + updateInvoked = true + return nil + } + err := stateMachineForTest.Handle(context.Background(), &app) + assert.True(t, updateInvoked) + assert.Nil(t, err) +} + +func TestRestoreFromExternalizedCheckpoint(t *testing.T) { + updateInvoked := false + + app := v1alpha1.FlinkApplication{ + Spec: v1alpha1.FlinkApplicationSpec{ + SavepointInfo: v1alpha1.SavepointInfo{ + TriggerID: "trigger", + }, + }, + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationSavepointing, + DeployHash: "blah", + }, + } + + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.GetSavepointStatusFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (*client.SavepointResponse, error) { + return &client.SavepointResponse{ + SavepointStatus: client.SavepointStatusResponse{ + Status: client.SavePointCompleted, + }, + }, nil + } + + mockFlinkController.FindExternalizedCheckpointFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) { + return "/tmp/checkpoint", nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, "/tmp/checkpoint", application.Spec.SavepointInfo.SavepointLocation) + assert.Equal(t, v1alpha1.FlinkApplicationSubmittingJob, application.Status.Phase) + updateInvoked = true + return nil + } + err := stateMachineForTest.Handle(context.Background(), &app) + assert.True(t, updateInvoked) + assert.Nil(t, err) +} + +func TestSubmittingToRunning(t *testing.T) { + jobID := "j1" + + app := v1alpha1.FlinkApplication{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-app", + Namespace: "flink", + }, + Spec: v1alpha1.FlinkApplicationSpec{ + JarName: "job.jar", + Parallelism: 5, + EntryClass: "com.my.Class", + ProgramArgs: "--test", + }, + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationSubmittingJob, + DeployHash: "old-hash", + }, + } + appHash := flink.HashForApplication(&app) + + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.IsServiceReadyFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) { + return true, nil + } + + getCount := 0 + mockFlinkController.GetJobsForApplicationFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) ([]client.FlinkJob, error) { + assert.Equal(t, appHash, hash) + var res []client.FlinkJob 
+ if getCount == 1 { + res = []client.FlinkJob{ + { + JobID: jobID, + Status: client.Running, + }} + } + getCount++ + return res, nil + } + + startCount := 0 + mockFlinkController.StartFlinkJobFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string, + jarName string, parallelism int32, entryClass string, programArgs string) (string, error) { + + assert.Equal(t, appHash, hash) + assert.Equal(t, app.Spec.JarName, jarName) + assert.Equal(t, app.Spec.Parallelism, parallelism) + assert.Equal(t, app.Spec.EntryClass, entryClass) + assert.Equal(t, app.Spec.ProgramArgs, programArgs) + + startCount++ + return jobID, nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + + getServiceCount := 0 + mockK8Cluster.GetServiceFunc = func(ctx context.Context, namespace string, name string) (*v1.Service, error) { + assert.Equal(t, "flink", namespace) + assert.Equal(t, "test-app", name) + + hash := "old-hash" + if getServiceCount > 0 { + hash = appHash + } + + getServiceCount++ + return &v1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-app", + Namespace: "flink", + }, + Spec: v1.ServiceSpec{ + Selector: map[string]string{ + "flink-app-hash": hash, + }, + }, + }, nil + } + + updateCount := 0 + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + if updateCount == 0 { + // update to the service + service := object.(*v1.Service) + assert.Equal(t, appHash, service.Spec.Selector["flink-app-hash"]) + } else if updateCount == 1 { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, jobFinalizer, application.Finalizers[0]) + } else if updateCount == 2 { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, jobID, application.Status.JobStatus.JobID) + assert.Equal(t, appHash, application.Status.DeployHash) + assert.Equal(t, app.Spec.JarName, app.Status.JobStatus.JarName) + assert.Equal(t, app.Spec.Parallelism, app.Status.JobStatus.Parallelism) + assert.Equal(t, app.Spec.EntryClass, app.Status.JobStatus.EntryClass) + assert.Equal(t, app.Spec.ProgramArgs, app.Status.JobStatus.ProgramArgs) + assert.Equal(t, v1alpha1.FlinkApplicationRunning, application.Status.Phase) + } + + updateCount++ + return nil + } + + err := stateMachineForTest.Handle(context.Background(), &app) + assert.Nil(t, err) + err = stateMachineForTest.Handle(context.Background(), &app) + assert.Nil(t, err) + + assert.Equal(t, 2, getCount) + assert.Equal(t, 1, startCount) + assert.Equal(t, 3, updateCount) +} + +func TestHandleApplicationNotReady(t *testing.T) { + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.IsServiceReadyFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) { + return false, nil + } + mockFlinkController.GetJobsForApplicationFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) ([]client.FlinkJob, error) { + assert.False(t, true) + return nil, nil + } + mockFlinkController.StartFlinkJobFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string, + jarName string, parallelism int32, entryClass string, programArgs string) (string, error) { + assert.False(t, true) + return "", nil + } + + app := v1alpha1.FlinkApplication{ + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationSubmittingJob, + }, + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + 
mockK8Cluster.GetServiceFunc = func(ctx context.Context, namespace string, name string) (*v1.Service, error) { + return &v1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-app", + Namespace: "flink", + }, + Spec: v1.ServiceSpec{ + Selector: map[string]string{ + "flink-app-hash": flink.HashForApplication(&app), + }, + }, + }, nil + } + + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + assert.False(t, true) + return nil + } + err := stateMachineForTest.Handle(context.Background(), &app) + assert.Nil(t, err) +} + +func TestHandleApplicationRunning(t *testing.T) { + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.GetCurrentAndOldDeploymentsForAppFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (*common.FlinkDeployment, []common.FlinkDeployment, error) { + fd := testFlinkDeployment(application) + return &fd, nil, nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + assert.True(t, false) + return nil + } + err := stateMachineForTest.Handle(context.Background(), &v1alpha1.FlinkApplication{ + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationRunning, + }, + }) + assert.Nil(t, err) +} + +func TestRunningToClusterStarting(t *testing.T) { + updateInvoked := false + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.GetCurrentAndOldDeploymentsForAppFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication) (*common.FlinkDeployment, []common.FlinkDeployment, error) { + return nil, []common.FlinkDeployment{testFlinkDeployment(application)}, nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, v1alpha1.FlinkApplicationUpdating, application.Status.Phase) + updateInvoked = true + return nil + } + err := stateMachineForTest.Handle(context.Background(), &v1alpha1.FlinkApplication{ + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationRunning, + }, + }) + assert.True(t, updateInvoked) + assert.Nil(t, err) +} + +func TestRollingBack(t *testing.T) { + jobID := "j1" + + app := v1alpha1.FlinkApplication{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-app", + Namespace: "flink", + }, + Spec: v1alpha1.FlinkApplicationSpec{ + JarName: "job.jar", + Parallelism: 5, + EntryClass: "com.my.Class", + ProgramArgs: "--test", + }, + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationRollingBackJob, + DeployHash: "old-hash", + JobStatus: v1alpha1.FlinkJobStatus{ + JarName: "old-job.jar", + Parallelism: 10, + EntryClass: "com.my.OldClass", + ProgramArgs: "--no-test", + }, + }, + } + appHash := flink.HashForApplication(&app) + + stateMachineForTest := getTestStateMachine() + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.IsServiceReadyFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (bool, error) { + assert.Equal(t, "old-hash", hash) + return true, nil + } + + startCalled := false + mockFlinkController.StartFlinkJobFunc = func(ctx context.Context, application 
*v1alpha1.FlinkApplication, hash string, + jarName string, parallelism int32, entryClass string, programArgs string) (string, error) { + + startCalled = true + assert.Equal(t, "old-hash", hash) + assert.Equal(t, app.Status.JobStatus.JarName, jarName) + assert.Equal(t, app.Status.JobStatus.Parallelism, parallelism) + assert.Equal(t, app.Status.JobStatus.EntryClass, entryClass) + assert.Equal(t, app.Status.JobStatus.ProgramArgs, programArgs) + return jobID, nil + } + + getCount := 0 + mockFlinkController.GetJobsForApplicationFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) ([]client.FlinkJob, error) { + assert.Equal(t, "old-hash", hash) + var res []client.FlinkJob + if getCount == 1 { + res = []client.FlinkJob{ + { + JobID: jobID, + Status: client.Running, + }} + } + getCount++ + return res, nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + + getServiceCount := 0 + mockK8Cluster.GetServiceFunc = func(ctx context.Context, namespace string, name string) (*v1.Service, error) { + assert.Equal(t, "flink", namespace) + assert.Equal(t, "test-app", name) + + hash := appHash + if getServiceCount > 0 { + hash = "old-hash" + } + + getServiceCount++ + return &v1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-app", + Namespace: "flink", + }, + Spec: v1.ServiceSpec{ + Selector: map[string]string{ + "flink-app-hash": hash, + }, + }, + }, nil + } + + updateCount := 0 + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + if updateCount == 0 { + // update to the service + service := object.(*v1.Service) + assert.Equal(t, "old-hash", service.Spec.Selector["flink-app-hash"]) + } else if updateCount == 1 { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, jobFinalizer, application.Finalizers[0]) + } else if updateCount == 2 { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, appHash, application.Status.FailedDeployHash) + assert.Equal(t, v1alpha1.FlinkApplicationDeployFailed, application.Status.Phase) + } + + updateCount++ + return nil + } + + err := stateMachineForTest.Handle(context.Background(), &app) + assert.Nil(t, err) + err = stateMachineForTest.Handle(context.Background(), &app) + assert.Nil(t, err) + + assert.True(t, startCalled) + assert.Equal(t, 3, updateCount) +} + +func TestIsApplicationStuck(t *testing.T) { + testDuration := config.Duration{} + testDuration.Duration = 5 * time.Minute + err := controller_config.ConfigSection.SetConfig(&controller_config.Config{ + StatemachineStalenessDuration: testDuration, + }) + assert.Nil(t, err) + + stateMachineForTest := getTestStateMachine() + + lastUpdated := metav1.NewTime(time.Now()) + stateMachineForTest.clock.(*clock.FakeClock).SetTime(time.Now()) + app := &v1alpha1.FlinkApplication{ + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationClusterStarting, + LastUpdatedAt: &lastUpdated, + DeployHash: "prevhash", + }, + } + + assert.False(t, stateMachineForTest.shouldRollback(context.Background(), app)) + + lastUpdated = metav1.NewTime(time.Now().Add(time.Duration(-8) * time.Minute)) + app.Status.LastUpdatedAt = &lastUpdated + assert.True(t, stateMachineForTest.shouldRollback(context.Background(), app)) + + app.Status.Phase = v1alpha1.FlinkApplicationRunning + assert.False(t, stateMachineForTest.shouldRollback(context.Background(), app)) + + app.Status.Phase = v1alpha1.FlinkApplicationDeployFailed + assert.False(t, stateMachineForTest.shouldRollback(context.Background(), app)) 
+} + +func TestDeleteWithSavepoint(t *testing.T) { + stateMachineForTest := getTestStateMachine() + jobID := "j1" + + app := v1alpha1.FlinkApplication{ + ObjectMeta: metav1.ObjectMeta{ + Finalizers: []string{jobFinalizer}, + DeletionTimestamp: &metav1.Time{Time: time.Now()}, + }, + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationDeleting, + JobStatus: v1alpha1.FlinkJobStatus{ + JobID: jobID, + }, + }, + } + + triggerID := "t1" + savepointPath := "s3:///path/to/savepoint" + + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + mockFlinkController.CancelWithSavepointFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (string, error) { + return triggerID, nil + } + + mockFlinkController.GetJobsForApplicationFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (jobs []client.FlinkJob, err error) { + return []client.FlinkJob{ + { + JobID: jobID, + Status: "RUNNING", + }, + }, nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + updateCount := 1 + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, v1alpha1.FlinkApplicationDeleting, application.Status.Phase) + + if updateCount == 1 { + assert.Equal(t, triggerID, application.Spec.SavepointInfo.TriggerID) + } else if updateCount == 2 { + assert.Equal(t, savepointPath, application.Spec.SavepointInfo.SavepointLocation) + } else if updateCount == 3 { + assert.Equal(t, 0, len(app.Finalizers)) + } + + updateCount++ + return nil + } + + err := stateMachineForTest.Handle(context.Background(), &app) + assert.NoError(t, err) + assert.Equal(t, 2, updateCount) + + mockFlinkController.GetSavepointStatusFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (*client.SavepointResponse, error) { + return &client.SavepointResponse{ + SavepointStatus: client.SavepointStatusResponse{ + Status: client.SavePointCompleted, + }, + Operation: client.SavepointOperationResponse{ + Location: "s3:///path/to/savepoint", + }, + }, nil + } + + err = stateMachineForTest.Handle(context.Background(), &app) + assert.NoError(t, err) + + assert.Equal(t, 3, updateCount) + + mockFlinkController.GetJobsForApplicationFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (jobs []client.FlinkJob, err error) { + return []client.FlinkJob{ + { + JobID: jobID, + Status: "CANCELED", + }, + }, nil + } + + err = stateMachineForTest.Handle(context.Background(), &app) + assert.NoError(t, err) + + assert.Equal(t, 4, updateCount) + +} + +func TestDeleteWithForceCancel(t *testing.T) { + stateMachineForTest := getTestStateMachine() + + jobID := "j1" + + app := v1alpha1.FlinkApplication{ + ObjectMeta: metav1.ObjectMeta{ + Finalizers: []string{jobFinalizer}, + DeletionTimestamp: &metav1.Time{Time: time.Now()}, + }, + Spec: v1alpha1.FlinkApplicationSpec{ + DeleteMode: v1alpha1.DeleteModeForceCancel, + }, + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationDeleting, + JobStatus: v1alpha1.FlinkJobStatus{ + JobID: jobID, + }, + }, + } + + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + + mockFlinkController.GetJobsForApplicationFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (jobs []client.FlinkJob, err error) { + return []client.FlinkJob{ + { + JobID: jobID, + Status: "RUNNING", + }, + 
}, nil + } + + cancelled := false + mockFlinkController.ForceCancelFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error { + cancelled = true + return nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + updateCount := 1 + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, v1alpha1.FlinkApplicationDeleting, application.Status.Phase) + + if updateCount == 1 { + assert.Equal(t, 0, len(app.Finalizers)) + } + + updateCount++ + return nil + } + + err := stateMachineForTest.Handle(context.Background(), &app) + assert.NoError(t, err) + assert.Equal(t, 1, updateCount) + assert.True(t, cancelled) + + mockFlinkController.GetJobsForApplicationFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (jobs []client.FlinkJob, err error) { + return []client.FlinkJob{ + { + JobID: jobID, + Status: "CANCELED", + }, + }, nil + } + + err = stateMachineForTest.Handle(context.Background(), &app) + assert.NoError(t, err) + assert.Equal(t, 2, updateCount) +} + +func TestDeleteModeNone(t *testing.T) { + stateMachineForTest := getTestStateMachine() + + app := v1alpha1.FlinkApplication{ + ObjectMeta: metav1.ObjectMeta{ + Finalizers: []string{jobFinalizer}, + DeletionTimestamp: &metav1.Time{Time: time.Now()}, + }, + Spec: v1alpha1.FlinkApplicationSpec{ + DeleteMode: v1alpha1.DeleteModeNone, + }, + Status: v1alpha1.FlinkApplicationStatus{ + Phase: v1alpha1.FlinkApplicationDeleting, + }, + } + + jobID := "j1" + + mockFlinkController := stateMachineForTest.flinkController.(*mock.FlinkController) + + mockFlinkController.GetJobsForApplicationFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) (jobs []client.FlinkJob, err error) { + return []client.FlinkJob{ + { + JobID: jobID, + Status: "RUNNING", + }, + }, nil + } + + cancelled := false + mockFlinkController.ForceCancelFunc = func(ctx context.Context, application *v1alpha1.FlinkApplication, hash string) error { + cancelled = true + return nil + } + + mockK8Cluster := stateMachineForTest.k8Cluster.(*k8mock.K8Cluster) + updateCount := 1 + mockK8Cluster.UpdateK8ObjectFunc = func(ctx context.Context, object runtime.Object) error { + application := object.(*v1alpha1.FlinkApplication) + assert.Equal(t, v1alpha1.FlinkApplicationDeleting, application.Status.Phase) + + if updateCount == 1 { + assert.Equal(t, 0, len(app.Finalizers)) + } + + updateCount++ + return nil + } + + err := stateMachineForTest.Handle(context.Background(), &app) + assert.NoError(t, err) + assert.Equal(t, 2, updateCount) + assert.False(t, cancelled) +} diff --git a/pkg/controller/k8/cluster.go b/pkg/controller/k8/cluster.go new file mode 100644 index 00000000..a3af3794 --- /dev/null +++ b/pkg/controller/k8/cluster.go @@ -0,0 +1,115 @@ +package k8 + +import ( + "context" + + "github.com/lyft/flytestdlib/logger" + v1 "k8s.io/api/apps/v1" + coreV1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +const ( + Deployment = "Deployment" + Pod = "Pod" + Service = "Service" + Endpoints = "Endpoints" + Ingress = "Ingress" +) + +type ClusterInterface interface { + // Tries to fetch the value from the controller runtime 
manager cache, if it does not exist, call API server
+	GetDeploymentsWithLabel(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error)
+
+	// Tries to fetch the value from the controller runtime manager cache, if it does not exist, call API server
+	GetService(ctx context.Context, namespace string, name string) (*coreV1.Service, error)
+
+	CreateK8Object(ctx context.Context, object runtime.Object) error
+	UpdateK8Object(ctx context.Context, object runtime.Object) error
+	DeleteK8Object(ctx context.Context, object runtime.Object) error
+}
+
+func NewK8Cluster(mgr manager.Manager) ClusterInterface {
+	return &Cluster{
+		cache:  mgr.GetCache(),
+		client: mgr.GetClient(),
+	}
+}
+
+type Cluster struct {
+	cache  cache.Cache
+	client client.Client
+}
+
+func (k *Cluster) GetService(ctx context.Context, namespace string, name string) (*coreV1.Service, error) {
+	service := &coreV1.Service{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: coreV1.SchemeGroupVersion.String(),
+			Kind:       Service,
+		},
+	}
+	key := types.NamespacedName{
+		Name:      name,
+		Namespace: namespace,
+	}
+	err := k.cache.Get(ctx, key, service)
+	if err != nil {
+		if IsK8sObjectDoesNotExist(err) {
+			// Not found in the informer cache; fall back to the API server.
+			err := k.client.Get(ctx, key, service)
+			if err != nil {
+				logger.Warnf(ctx, "Failed to get service %v", err)
+				return nil, err
+			}
+			return service, nil
+		}
+		logger.Warnf(ctx, "Failed to get service from cache %v", err)
+		return nil, err
+	}
+	return service, nil
+}
+
+func (k *Cluster) GetDeploymentsWithLabel(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) {
+	deploymentList := &v1.DeploymentList{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: v1.SchemeGroupVersion.String(),
+			Kind:       Deployment,
+		},
+	}
+	labelSelector := labels.SelectorFromSet(labelMap)
+	options := &client.ListOptions{
+		LabelSelector: labelSelector,
+	}
+	err := k.cache.List(ctx, options, deploymentList)
+	if err != nil {
+		if IsK8sObjectDoesNotExist(err) {
+			// Cache miss; list directly from the API server.
+			err := k.client.List(ctx, options, deploymentList)
+			if err != nil {
+				logger.Warnf(ctx, "Failed to list deployments %v", err)
+				return nil, err
+			}
+			return deploymentList, nil
+		}
+		logger.Warnf(ctx, "Failed to list deployments from cache %v", err)
+		return nil, err
+	}
+	return deploymentList, nil
+}
+
+func (k *Cluster) CreateK8Object(ctx context.Context, object runtime.Object) error {
+	objCreate := object.DeepCopyObject()
+	return k.client.Create(ctx, objCreate)
+}
+
+func (k *Cluster) UpdateK8Object(ctx context.Context, object runtime.Object) error {
+	objUpdate := object.DeepCopyObject()
+	return k.client.Update(ctx, objUpdate)
+}
+
+func (k *Cluster) DeleteK8Object(ctx context.Context, object runtime.Object) error {
+	objDelete := object.DeepCopyObject()
+	return k.client.Delete(ctx, objDelete)
+}
diff --git a/pkg/controller/k8/mock/mock_k8.go b/pkg/controller/k8/mock/mock_k8.go
new file mode 100644
index 00000000..cb43f207
--- /dev/null
+++ b/pkg/controller/k8/mock/mock_k8.go
@@ -0,0 +1,58 @@
+package mock
+
+import (
+	"context"
+
+	v1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+)
+
+type GetDeploymentsWithLabelFunc func(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error)
+type CreateK8ObjectFunc func(ctx context.Context, object runtime.Object) error
+type GetServiceFunc func(ctx context.Context, namespace string, name string) (*corev1.Service, error)
+type UpdateK8ObjectFunc func(ctx context.Context, object runtime.Object) error
+type DeleteK8ObjectFunc func(ctx context.Context, object
runtime.Object) error + +type K8Cluster struct { + GetDeploymentsWithLabelFunc GetDeploymentsWithLabelFunc + GetServiceFunc GetServiceFunc + CreateK8ObjectFunc CreateK8ObjectFunc + UpdateK8ObjectFunc UpdateK8ObjectFunc + DeleteK8ObjectFunc DeleteK8ObjectFunc +} + +func (m *K8Cluster) GetDeploymentsWithLabel(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) { + if m.GetDeploymentsWithLabelFunc != nil { + return m.GetDeploymentsWithLabelFunc(ctx, namespace, labelMap) + } + return nil, nil +} + +func (m *K8Cluster) GetService(ctx context.Context, namespace string, name string) (*corev1.Service, error) { + if m.GetServiceFunc != nil { + return m.GetServiceFunc(ctx, namespace, name) + } + return nil, nil +} + +func (m *K8Cluster) CreateK8Object(ctx context.Context, object runtime.Object) error { + if m.CreateK8ObjectFunc != nil { + return m.CreateK8ObjectFunc(ctx, object) + } + return nil +} + +func (m *K8Cluster) UpdateK8Object(ctx context.Context, object runtime.Object) error { + if m.UpdateK8ObjectFunc != nil { + return m.UpdateK8ObjectFunc(ctx, object) + } + return nil +} + +func (m *K8Cluster) DeleteK8Object(ctx context.Context, object runtime.Object) error { + if m.DeleteK8ObjectFunc != nil { + return m.DeleteK8ObjectFunc(ctx, object) + } + return nil +} diff --git a/pkg/controller/k8/utils.go b/pkg/controller/k8/utils.go new file mode 100644 index 00000000..e601533a --- /dev/null +++ b/pkg/controller/k8/utils.go @@ -0,0 +1,69 @@ +package k8 + +import ( + "github.com/lyft/flinkk8soperator/pkg/apis/app/v1alpha1" + v1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + AppKey = "flink-app" +) + +func IsK8sObjectDoesNotExist(err error) bool { + return k8serrors.IsNotFound(err) || k8serrors.IsGone(err) || k8serrors.IsResourceExpired(err) +} + +func GetAppLabel(appName string) map[string]string { + return map[string]string{ + AppKey: appName, + } +} + +func GetDeploymentWithName(deployments []v1.Deployment, name string) *v1.Deployment { + if len(deployments) == 0 { + return nil + } + for _, deployment := range deployments { + if deployment.Name == name { + return &deployment + } + } + return nil +} + +func CreateEvent(app *v1alpha1.FlinkApplication, fieldPath string, eventType string, reason string, message string) corev1.Event { + eventTime := metav1.Now() + + objectReference := corev1.ObjectReference{ + Kind: app.Kind, + Name: app.Name, + Namespace: app.Namespace, + UID: app.UID, + APIVersion: app.APIVersion, + FieldPath: fieldPath, + } + + return corev1.Event{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "Event", + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: app.Namespace, + GenerateName: "event", + }, + Reason: reason, + Message: message, + InvolvedObject: objectReference, + Source: corev1.EventSource{ + Component: "flinkk8soperator", + }, + FirstTimestamp: eventTime, + LastTimestamp: eventTime, + Type: eventType, + } + +} diff --git a/pkg/controller/k8/utils_test.go b/pkg/controller/k8/utils_test.go new file mode 100644 index 00000000..0b8f87be --- /dev/null +++ b/pkg/controller/k8/utils_test.go @@ -0,0 +1,39 @@ +package k8 + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/apps/v1" +) + +func TestGetAppLabel(t *testing.T) { + appName := "app_name" + appLabel := GetAppLabel(appName) + assert.Equal(t, map[string]string{ + "flink-app": appName, + }, appLabel) 
+}
+
+func TestGetDeploymentWithName(t *testing.T) {
+	name := "jm-name"
+	dep := v1.Deployment{}
+	dep.Name = name
+	deployments := []v1.Deployment{
+		dep,
+	}
+	actualDeployment := GetDeploymentWithName(deployments, name)
+	assert.NotNil(t, actualDeployment)
+	assert.Equal(t, dep, *actualDeployment)
+}
+
+func TestGetDeploymentNotExists(t *testing.T) {
+	name := "jm-name"
+	dep := v1.Deployment{}
+	dep.Name = name
+	deployments := []v1.Deployment{
+		dep,
+	}
+	actualDeployment := GetDeploymentWithName(deployments, "random")
+	assert.Nil(t, actualDeployment)
+}
diff --git a/script/lint b/script/lint
new file mode 100755
index 00000000..7ff4c8f8
--- /dev/null
+++ b/script/lint
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+#echo "************************ golint *********************************"
+#for pkg in $(glide nv); do
+#	golint $pkg | grep -v comment
+#done
+#echo "*****************************************************************"
+
+echo "************************ govet **********************************"
+go vet $(glide nv)
+echo "*****************************************************************"
+
+echo "************************ goimports ******************************"
+goimports -d $(find . -type f -name '*.go' -not -path "./vendor/*")
+echo "*****************************************************************"
+
+echo "************************ gofmt *********************************"
+gofmt -s -w $(find . -type f -name '*.go' -not -path "./vendor/*")
+echo "*****************************************************************"
\ No newline at end of file
diff --git a/tmp/build/Dockerfile b/tmp/build/Dockerfile
new file mode 100644
index 00000000..0b3e86f9
--- /dev/null
+++ b/tmp/build/Dockerfile
@@ -0,0 +1,6 @@
+FROM alpine:3.6
+
+RUN adduser -D flinkk8soperator
+USER flinkk8soperator
+
+ADD tmp/_output/bin/flinkk8soperator /usr/local/bin/flinkk8soperator
diff --git a/tmp/build/build.sh b/tmp/build/build.sh
new file mode 100755
index 00000000..ed9d4418
--- /dev/null
+++ b/tmp/build/build.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+if ! which go > /dev/null; then
+	echo "golang needs to be installed"
+	exit 1
+fi
+
+BIN_DIR="$(pwd)/tmp/_output/bin"
+mkdir -p ${BIN_DIR}
+PROJECT_NAME="flinkk8soperator"
+REPO_PATH="github.com/lyft/flinkk8soperator"
+BUILD_PATH="${REPO_PATH}/cmd/${PROJECT_NAME}"
+echo "building "${PROJECT_NAME}"..."
+GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o ${BIN_DIR}/${PROJECT_NAME} $BUILD_PATH
diff --git a/tmp/build/docker_build.sh b/tmp/build/docker_build.sh
new file mode 100755
index 00000000..da98858d
--- /dev/null
+++ b/tmp/build/docker_build.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+if ! which docker > /dev/null; then
+	echo "docker needs to be installed"
+	exit 1
+fi
+
+: ${IMAGE:?"Need to set IMAGE, e.g. gcr.io/<repo>/<your>-operator"}
+
+echo "building container ${IMAGE}..."
+docker build -t "${IMAGE}" -f tmp/build/Dockerfile .
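The two build scripts above are meant to be run from the repository root: build.sh compiles the operator binary into tmp/_output/bin, and docker_build.sh packages it with tmp/build/Dockerfile. A minimal local invocation might look like the following; the image tag is only an illustrative example, not a published image.

./tmp/build/build.sh
IMAGE=lyft/flinkk8soperator:latest ./tmp/build/docker_build.sh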
diff --git a/tmp/codegen/boilerplate.go.txt b/tmp/codegen/boilerplate.go.txt new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/tmp/codegen/boilerplate.go.txt @@ -0,0 +1 @@ + diff --git a/tmp/codegen/update-generated.sh b/tmp/codegen/update-generated.sh new file mode 100755 index 00000000..eaa93acc --- /dev/null +++ b/tmp/codegen/update-generated.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail + +vendor/k8s.io/code-generator/generate-groups.sh \ +deepcopy,client \ +github.com/lyft/flinkk8soperator/pkg/client \ +github.com/lyft/flinkk8soperator/pkg/apis \ +app:v1alpha1 \ +--go-header-file "./tmp/codegen/boilerplate.go.txt" diff --git a/version/version.go b/version/version.go new file mode 100644 index 00000000..e3e130bf --- /dev/null +++ b/version/version.go @@ -0,0 +1,5 @@ +package version + +var ( + Version = "0.0.1" +)