diff --git a/Dockerfile b/Dockerfile index c126cca21..11e9678f8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,9 +40,10 @@ USER root COPY --from=builder /usr/bin/spark-operator /usr/bin/ RUN apt-get update --allow-releaseinfo-change \ && apt-get update \ - && apt-get install -y tini \ + && apt-get install -y tini htop \ && rm -rf /var/lib/apt/lists/* COPY entrypoint.sh /usr/bin/ +COPY export-pprof.sh /usr/bin/ ENTRYPOINT ["/usr/bin/entrypoint.sh"] diff --git a/charts/spark-operator-chart/README.md b/charts/spark-operator-chart/README.md index 8dfe591d0..f966d8df4 100644 --- a/charts/spark-operator-chart/README.md +++ b/charts/spark-operator-chart/README.md @@ -91,6 +91,8 @@ See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command docum | imagePullSecrets | list | `[]` | Image pull secrets | | ingressUrlFormat | string | `""` | Ingress URL format. Requires the UI service to be enabled by setting `uiService.enable` to true. | | istio.enabled | bool | `false` | When using `istio`, spark jobs need to run without a sidecar to properly terminate | +| k8sBurstQps | int | `200` | Kubernetes API client burst setting | +| k8sQps | int | `100` | Kubernetes API client QPS setting | | labelSelectorFilter | string | `""` | A comma-separated list of key=value, or key labels to filter resources during watch and list based on the specified labels. | | leaderElection.lockName | string | `"spark-operator-lock"` | Leader election lock name. Ref: https://github.com/kubeflow/spark-operator/blob/master/docs/user-guide.md#enabling-leader-election-for-high-availability. | | leaderElection.lockNamespace | string | `""` | Optionally store the lock in another namespace. Defaults to operator's namespace | diff --git a/charts/spark-operator-chart/templates/deployment.yaml b/charts/spark-operator-chart/templates/deployment.yaml index cf12fb2e8..2291044a2 100644 --- a/charts/spark-operator-chart/templates/deployment.yaml +++ b/charts/spark-operator-chart/templates/deployment.yaml @@ -74,6 +74,11 @@ spec: args: - -v={{ .Values.logLevel }} - -logtostderr + - -max-queue-time-without-update-in-minutes=30 + - -queue-cleaner-interval-in-minutes=10 + - -enable-profiling=false + - -api-qps={{ .Values.k8sQps }} + - -api-burst={{ .Values.k8sBurstQps }} {{- if eq (len $jobNamespaces) 1 }} - -namespace={{ index $jobNamespaces 0 }} {{- end }} diff --git a/charts/spark-operator-chart/values.yaml b/charts/spark-operator-chart/values.yaml index d9f63b645..396c0b38d 100644 --- a/charts/spark-operator-chart/values.yaml +++ b/charts/spark-operator-chart/values.yaml @@ -60,6 +60,12 @@ sparkJobNamespaces: # -- Operator concurrency, higher values might increase memory usage controllerThreads: 10 +# -- Kubernetes API client QPS setting +k8sQps: 100 + +# -- Kubernetes API client burst setting +k8sBurstQps: 200 + # -- Operator resync interval. Note that the operator will respond to events (e.g. create, update) # unrelated to this setting resyncInterval: 30 diff --git a/docs/api-docs.md b/docs/api-docs.md index 245551527..10c753cea 100644 --- a/docs/api-docs.md +++ b/docs/api-docs.md @@ -2873,6 +2873,31 @@ string +memoryLimit
+ +string + + + +(Optional) +

MemoryLimit is the memory limit to set on the pod, for cases where an explicit pod memory limit should be enforced.

+ + + + +memoryRequestOverride
+ +string + + + +(Optional) +

MemoryRequestOverride is an optional parameter that allows setting a pod memory request lower than +the JVM memory plus memory overhead that Spark requests by default.

+ + + + gpu
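The two new fields documented above map onto optional pointer fields on `SparkPodSpec` (added further down in this diff). As a hedged sketch, not part of this PR, of how they would be set when building a `SparkApplication` in Go, assuming the usual v1beta2 types where `DriverSpec` and `ExecutorSpec` embed `SparkPodSpec`:

```go
package main

import (
	"fmt"

	"github.com/kubeflow/spark-operator/pkg/apis/sparkoperator.k8s.io/v1beta2"
)

func strPtr(s string) *string { return &s }

func main() {
	app := v1beta2.SparkApplication{}

	// Cap the driver pod at 6Gi while requesting only 3g up front; the webhook
	// converts Java-style suffixes ("g", "m") to Kubernetes quantities.
	app.Spec.Driver.MemoryLimit = strPtr("6g")
	app.Spec.Driver.MemoryRequestOverride = strPtr("3g")

	// Executors can be tuned independently of the driver.
	app.Spec.Executor.MemoryLimit = strPtr("4g")

	fmt.Println(*app.Spec.Driver.MemoryLimit, *app.Spec.Driver.MemoryRequestOverride)
}
```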
diff --git a/export-pprof.sh b/export-pprof.sh new file mode 100644 index 000000000..bf347cd1d --- /dev/null +++ b/export-pprof.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Generate a timestamp +timestamp=$(date +%Y-%m-%d_%H-%M-%S) + +# Function to fetch goroutine profile +fetch_goroutine_profile() { + local profile_type=$1 # 'start' or 'end' + local profile_path="/tmp/goroutine-profile-debug1-$profile_type-$timestamp.prof" + echo "Starting to fetch Goroutine profile ($profile_type) at $(date)..." + curl "http://localhost:6060/debug/pprof/goroutine?debug=1" -o "$profile_path" + echo "Completed fetching Goroutine profile ($profile_type) at $(date). File saved to $profile_path" +} + +echo "Starting to fetch CPU profile at $(date)..." +cpu_profile="/tmp/cpu-profile-30sec-$timestamp.prof" +curl "http://localhost:6060/debug/pprof/profile?seconds=30" -o "$cpu_profile" & +echo "CPU profile fetch initiated, running for 30 seconds..." + +echo "Starting to fetch Trace profile at $(date)..." +trace_profile="/tmp/trace-profile-30sec-$timestamp.prof" +curl "http://localhost:6060/debug/pprof/trace?seconds=30" -o "$trace_profile" & +echo "Trace profile fetch initiated, running for 30 seconds..." + +echo "Fetching initial Goroutine profile..." +fetch_goroutine_profile "start" & + +# Wait for CPU and trace profiling to complete +wait + +echo "Starting to fetch final Goroutine profile after waiting for other profiles to complete..." +fetch_goroutine_profile "end" + +echo "All profiling data collected" + +# Copying profiles to S3 bucket +echo "CPU profile output - $cpu_profile" +echo "Trace profile output - $trace_profile" +echo "Initial Goroutine profile output - /tmp/goroutine-profile-debug1-start-$timestamp.prof" +echo "Final Goroutine profile output - /tmp/goroutine-profile-debug1-end-$timestamp.prof" + diff --git a/main.go b/main.go index 58f6505b7..3fc95b1ab 100644 --- a/main.go +++ b/main.go @@ -20,6 +20,8 @@ import ( "context" "flag" "fmt" + "log" + "net/http" "os" "os/signal" "strings" @@ -49,37 +51,48 @@ import ( "github.com/kubeflow/spark-operator/pkg/controller/sparkapplication" "github.com/kubeflow/spark-operator/pkg/util" "github.com/kubeflow/spark-operator/pkg/webhook" + + _ "net/http/pprof" + "runtime" ) var ( - master = flag.String("master", "", "The address of the Kubernetes API server. Overrides any value in kubeconfig. Only required if out-of-cluster.") - kubeConfig = flag.String("kubeConfig", "", "Path to a kube config. Only required if out-of-cluster.") - controllerThreads = flag.Int("controller-threads", 10, "Number of worker threads used by the SparkApplication controller.") - resyncInterval = flag.Int("resync-interval", 30, "Informer resync interval in seconds.") - namespace = flag.String("namespace", apiv1.NamespaceAll, "The Kubernetes namespace to manage. Will manage custom resource objects of the managed CRD types for the whole cluster if unset.") - labelSelectorFilter = flag.String("label-selector-filter", "", "A comma-separated list of key=value, or key labels to filter resources during watch and list based on the specified labels.") - enableWebhook = flag.Bool("enable-webhook", false, "Whether to enable the mutating admission webhook for admitting and patching Spark pods.") - webhookTimeout = flag.Int("webhook-timeout", 30, "Webhook Timeout in seconds before the webhook returns a timeout") - enableResourceQuotaEnforcement = flag.Bool("enable-resource-quota-enforcement", false, "Whether to enable ResourceQuota enforcement for SparkApplication resources. 
Requires the webhook to be enabled.") - ingressURLFormat = flag.String("ingress-url-format", "", "Ingress URL format.") - enableUIService = flag.Bool("enable-ui-service", true, "Enable Spark service UI.") - enableLeaderElection = flag.Bool("leader-election", false, "Enable Spark operator leader election.") - leaderElectionLockNamespace = flag.String("leader-election-lock-namespace", "spark-operator", "Namespace in which to create the ConfigMap for leader election.") - leaderElectionLockName = flag.String("leader-election-lock-name", "spark-operator-lock", "Name of the ConfigMap for leader election.") - leaderElectionLeaseDuration = flag.Duration("leader-election-lease-duration", 15*time.Second, "Leader election lease duration.") - leaderElectionRenewDeadline = flag.Duration("leader-election-renew-deadline", 14*time.Second, "Leader election renew deadline.") - leaderElectionRetryPeriod = flag.Duration("leader-election-retry-period", 4*time.Second, "Leader election retry period.") - enableBatchScheduler = flag.Bool("enable-batch-scheduler", false, fmt.Sprintf("Enable batch schedulers for pods' scheduling, the available batch schedulers are: (%s).", strings.Join(batchscheduler.GetRegisteredNames(), ","))) - enableMetrics = flag.Bool("enable-metrics", false, "Whether to enable the metrics endpoint.") - metricsPort = flag.String("metrics-port", "10254", "Port for the metrics endpoint.") - metricsEndpoint = flag.String("metrics-endpoint", "/metrics", "Metrics endpoint.") - metricsPrefix = flag.String("metrics-prefix", "", "Prefix for the metrics.") - ingressClassName = flag.String("ingress-class-name", "", "Set ingressClassName for ingress resources created.") - metricsLabels util.ArrayFlags - metricsJobStartLatencyBuckets util.HistogramBuckets = util.DefaultJobStartLatencyBuckets + master = flag.String("master", "", "The address of the Kubernetes API server. Overrides any value in kubeconfig. Only required if out-of-cluster.") + kubeConfig = flag.String("kubeConfig", "", "Path to a kube config. Only required if out-of-cluster.") + controllerThreads = flag.Int("controller-threads", 10, "Number of application queue maps created and used by the SparkApplication controller.") + enableProfiling = flag.Bool("enable-profiling", false, "Whether to enable the pprof profiling server.") + resyncInterval = flag.Int("resync-interval", 30, "Informer resync interval in seconds.") + namespace = flag.String("namespace", apiv1.NamespaceAll, "The Kubernetes namespace to manage. Will manage custom resource objects of the managed CRD types for the whole cluster if unset.") + labelSelectorFilter = flag.String("label-selector-filter", "", "A comma-separated list of key=value, or key labels to filter resources during watch and list based on the specified labels.") + enableWebhook = flag.Bool("enable-webhook", false, "Whether to enable the mutating admission webhook for admitting and patching Spark pods.") + webhookTimeout = flag.Int("webhook-timeout", 30, "Webhook Timeout in seconds before the webhook returns a timeout") + enableResourceQuotaEnforcement = flag.Bool("enable-resource-quota-enforcement", false, "Whether to enable ResourceQuota enforcement for SparkApplication resources. 
Requires the webhook to be enabled.") + ingressURLFormat = flag.String("ingress-url-format", "", "Ingress URL format.") + enableUIService = flag.Bool("enable-ui-service", true, "Enable Spark service UI.") + enableLeaderElection = flag.Bool("leader-election", false, "Enable Spark operator leader election.") + leaderElectionLockNamespace = flag.String("leader-election-lock-namespace", "spark-operator", "Namespace in which to create the ConfigMap for leader election.") + leaderElectionLockName = flag.String("leader-election-lock-name", "spark-operator-lock", "Name of the ConfigMap for leader election.") + leaderElectionLeaseDuration = flag.Duration("leader-election-lease-duration", 15*time.Second, "Leader election lease duration.") + leaderElectionRenewDeadline = flag.Duration("leader-election-renew-deadline", 14*time.Second, "Leader election renew deadline.") + leaderElectionRetryPeriod = flag.Duration("leader-election-retry-period", 4*time.Second, "Leader election retry period.") + enableBatchScheduler = flag.Bool("enable-batch-scheduler", false, fmt.Sprintf("Enable batch schedulers for pods' scheduling, the available batch schedulers are: (%s).", strings.Join(batchscheduler.GetRegisteredNames(), ","))) + enableMetrics = flag.Bool("enable-metrics", false, "Whether to enable the metrics endpoint.") + metricsPort = flag.String("metrics-port", "10254", "Port for the metrics endpoint.") + metricsEndpoint = flag.String("metrics-endpoint", "/metrics", "Metrics endpoint.") + metricsPrefix = flag.String("metrics-prefix", "", "Prefix for the metrics.") + ingressClassName = flag.String("ingress-class-name", "", "Set ingressClassName for ingress resources created.") + maxQueueTimeWithoutUpdateInMinutes = flag.Int("max-queue-time-without-update-in-minutes", 30, "Maximum time, in minutes, that an application queue may go without updates before it is considered stale and deleted.") + queueCleanerIntervalInMinutes = flag.Int("queue-cleaner-interval-in-minutes", 10, "Interval, in minutes, at which the queue cleaner runs.") + apiQps = flag.Float64("api-qps", 100.00, "QPS limit for the Kubernetes API client.") + apiBurst = flag.Int("api-burst", 200, "Burst limit for the Kubernetes API client.") + + metricsLabels util.ArrayFlags + metricsJobStartLatencyBuckets util.HistogramBuckets = util.DefaultJobStartLatencyBuckets ) func main() { + + flag.Var(&metricsLabels, "metrics-labels", "Labels for the metrics") flag.Var(&metricsJobStartLatencyBuckets, "metrics-job-start-latency-buckets", "Comma-separated boundary values (in seconds) for the job start latency histogram bucket; "+ @@ -91,6 +104,8 @@ func main() { if err != nil { glog.Fatal(err) } + config.QPS = float32(*apiQps) + config.Burst = *apiBurst kubeClient, err := clientset.NewForConfig(config) if err != nil { glog.Fatal(err) } @@ -102,6 +117,13 @@ func main() { stopCh := make(chan struct{}, 1) startCh := make(chan struct{}, 1) + go func() { + if *enableProfiling { + runtime.SetMutexProfileFraction(1) // Enable mutex profiling + log.Println(http.ListenAndServe("localhost:6060", nil)) + } + }() + if *enableLeaderElection { podName := os.Getenv("POD_NAME") hostname, err := os.Hostname() @@ -191,7 +213,7 @@ func main() { } applicationController := sparkapplication.NewController( - crClient, kubeClient, crInformerFactory, podInformerFactory, metricConfig, *namespace, *ingressURLFormat, *ingressClassName, batchSchedulerMgr, *enableUIService) + crClient, kubeClient, crInformerFactory, podInformerFactory, metricConfig, *namespace, *ingressURLFormat, *ingressClassName, batchSchedulerMgr, *enableUIService, *controllerThreads) 
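The -api-qps and -api-burst flags above feed client-go's client-side throttling: QPS is the sustained request rate, and Burst is the short-term peak allowed before requests start being delayed. A standalone sketch of the same wiring (out-of-cluster for simplicity; only the flag names are taken from the change above, everything else is illustrative):

```go
package main

import (
	"flag"
	"log"

	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

var (
	apiQPS   = flag.Float64("api-qps", 100, "Sustained queries per second allowed against the Kubernetes API.")
	apiBurst = flag.Int("api-burst", 200, "Maximum burst of queries allowed against the Kubernetes API.")
)

func main() {
	flag.Parse()

	// Build an out-of-cluster config from the default kubeconfig location.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		log.Fatalf("failed to build config: %v", err)
	}

	// Client-side rate limiting: up to Burst requests go out immediately,
	// anything beyond the sustained QPS rate is delayed by the client.
	config.QPS = float32(*apiQPS)
	config.Burst = *apiBurst

	if _, err := kubernetes.NewForConfig(config); err != nil {
		log.Fatalf("failed to create clientset: %v", err)
	}
	log.Printf("clientset created with QPS=%.0f, Burst=%d", config.QPS, config.Burst)
}
```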
scheduledApplicationController := scheduledsparkapplication.NewController( crClient, kubeClient, apiExtensionsClient, crInformerFactory, clock.RealClock{}) @@ -233,8 +255,9 @@ func main() { } glog.Info("Starting application controller goroutines") - - if err = applicationController.Start(*controllerThreads, stopCh); err != nil { + queueCleanerIntervalDuration := time.Duration(*queueCleanerIntervalInMinutes) * time.Minute + maxQueueTimeWithoutUpdateDuration := time.Duration(*maxQueueTimeWithoutUpdateInMinutes) * time.Minute + if err = applicationController.Start(maxQueueTimeWithoutUpdateDuration, queueCleanerIntervalDuration, stopCh); err != nil { glog.Fatal(err) } if err = scheduledApplicationController.Start(*controllerThreads, stopCh); err != nil { @@ -248,7 +271,7 @@ func main() { } glog.Info("Shutting down the Spark Operator") - applicationController.Stop() + applicationController.Stop(stopCh) scheduledApplicationController.Stop() if *enableWebhook { if err := hook.Stop(); err != nil { diff --git a/pkg/apis/sparkoperator.k8s.io/v1beta2/types.go b/pkg/apis/sparkoperator.k8s.io/v1beta2/types.go index ca009e739..1968eb6ae 100644 --- a/pkg/apis/sparkoperator.k8s.io/v1beta2/types.go +++ b/pkg/apis/sparkoperator.k8s.io/v1beta2/types.go @@ -483,6 +483,13 @@ type SparkPodSpec struct { // MemoryOverhead is the amount of off-heap memory to allocate in cluster mode, in MiB unless otherwise specified. // +optional MemoryOverhead *string `json:"memoryOverhead,omitempty"` + // MemoryLimit is the memory limit to set on the pod, for cases where an explicit pod memory limit should be enforced. + // +optional + MemoryLimit *string `json:"memoryLimit,omitempty"` + // MemoryRequestOverride is an optional parameter that allows setting a pod memory request lower than + // the JVM memory plus memory overhead that Spark requests by default. + // +optional + MemoryRequestOverride *string `json:"memoryRequestOverride,omitempty"` // GPU specifies GPU requirement for the pod. 
// +optional GPU *GPUSpec `json:"gpu,omitempty"` diff --git a/pkg/apis/sparkoperator.k8s.io/v1beta2/zz_generated.deepcopy.go b/pkg/apis/sparkoperator.k8s.io/v1beta2/zz_generated.deepcopy.go index 0b15feb0a..1f65f956c 100644 --- a/pkg/apis/sparkoperator.k8s.io/v1beta2/zz_generated.deepcopy.go +++ b/pkg/apis/sparkoperator.k8s.io/v1beta2/zz_generated.deepcopy.go @@ -249,6 +249,11 @@ func (in *ExecutorSpec) DeepCopyInto(out *ExecutorSpec) { *out = new(string) **out = **in } + if in.Lifecycle != nil { + in, out := &in.Lifecycle, &out.Lifecycle + *out = new(v1.Lifecycle) + (*in).DeepCopyInto(*out) + } if in.DeleteOnTermination != nil { in, out := &in.DeleteOnTermination, &out.DeleteOnTermination *out = new(bool) @@ -837,6 +842,16 @@ func (in *SparkPodSpec) DeepCopyInto(out *SparkPodSpec) { *out = new(string) **out = **in } + if in.MemoryLimit != nil { + in, out := &in.MemoryLimit, &out.MemoryLimit + *out = new(string) + **out = **in + } + if in.MemoryRequestOverride != nil { + in, out := &in.MemoryRequestOverride, &out.MemoryRequestOverride + *out = new(string) + **out = **in + } if in.GPU != nil { in, out := &in.GPU, &out.GPU *out = new(GPUSpec) diff --git a/pkg/controller/sparkapplication/controller.go b/pkg/controller/sparkapplication/controller.go index 3e9b373b8..2aafb4b14 100644 --- a/pkg/controller/sparkapplication/controller.go +++ b/pkg/controller/sparkapplication/controller.go @@ -18,8 +18,12 @@ package sparkapplication import ( "context" + "crypto/sha256" "fmt" + "math/big" "os/exec" + + "sync" "time" "github.com/golang/glog" @@ -65,11 +69,24 @@ var ( execCommand = exec.Command ) +// AppQueueInfo is a struct to hold the queue and the last update timestamp for a SparkApplication +type AppQueueInfo struct { + Queue workqueue.RateLimitingInterface + LastUpdateTs time.Time +} + +// AppQueuesMap holds a map of per-application queues together with the mutex that guards it. +// Several of these maps are used so that a single mutex does not become a contention bottleneck at high scale. +type AppQueuesMap struct { + appQueueMutex *sync.RWMutex + appQueueMap map[string]AppQueueInfo +} + // Controller manages instances of SparkApplication. 
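The sharding idea above (several maps, each with its own RWMutex) keeps one hot lock from serializing every event. A self-contained sketch of the same hash-then-mod shard selection, using FNV-1a here purely as a cheaper stand-in for the SHA-256 hashing the controller uses:

```go
package main

import (
	"fmt"
	"hash/fnv"
	"sync"
)

// shard is one independently locked map, analogous to a single AppQueuesMap.
type shard struct {
	mu sync.RWMutex
	m  map[string]int
}

// shardedCounters spreads keys across several shards so that concurrent
// writers for different keys rarely contend on the same mutex.
type shardedCounters struct {
	shards []*shard
}

func newShardedCounters(n int) *shardedCounters {
	s := &shardedCounters{shards: make([]*shard, n)}
	for i := range s.shards {
		s.shards[i] = &shard{m: make(map[string]int)}
	}
	return s
}

// shardIndex hashes the key and reduces it modulo the shard count,
// mirroring GetRelevantMap's hash-then-mod selection.
func (s *shardedCounters) shardIndex(key string) int {
	h := fnv.New32a()
	h.Write([]byte(key))
	return int(h.Sum32() % uint32(len(s.shards)))
}

func (s *shardedCounters) inc(key string) {
	sh := s.shards[s.shardIndex(key)]
	sh.mu.Lock()
	sh.m[key]++
	sh.mu.Unlock()
}

func main() {
	s := newShardedCounters(10)
	s.inc("default/spark-pi")
	fmt.Println("shard for default/spark-pi:", s.shardIndex("default/spark-pi"))
}
```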
type Controller struct { crdClient crdclientset.Interface kubeClient clientset.Interface - queue workqueue.RateLimitingInterface + appQueuesMaps []AppQueuesMap cacheSynced cache.InformerSynced recorder record.EventRecorder metrics *sparkAppMetrics @@ -81,6 +98,69 @@ type Controller struct { enableUIService bool } +func (c *Controller) GetRelevantMap(appName string) AppQueuesMap { + // Hash the string using SHA256 + hash := sha256.Sum256([]byte(appName)) + + // Convert the hash to a big integer + hashInt := new(big.Int).SetBytes(hash[:]) + + // Take the hash modulo the number of maps to get an index in range + modValue := new(big.Int).SetInt64(int64(len(c.appQueuesMaps))) + result64 := new(big.Int).Mod(hashInt, modValue) + + result := int(result64.Int64()) + + // Log the chosen map index + glog.V(2).Infof("Map num for app %s: %d, mutex address is %p", appName, result, c.appQueuesMaps[result].appQueueMutex) + return c.appQueuesMaps[result] +} + +func (c *Controller) GetOrCreateRelevantQueue(appName string) AppQueueInfo { + m := c.GetRelevantMap(appName) + glog.V(2).Infof("for app %s, mutex address is %p", appName, m.appQueueMutex) + m.appQueueMutex.RLock() + queueInfo, exists := m.appQueueMap[appName] + m.appQueueMutex.RUnlock() + if !exists { + m.appQueueMutex.Lock() + defer m.appQueueMutex.Unlock() + // Re-check in case the queue was created while we were waiting for the write lock + queueInfo, exists = m.appQueueMap[appName] + if exists { + return queueInfo + } + + // Create a new queue for the app + queueName := fmt.Sprintf("spark-application-queue-%v", appName) + queue := workqueue.NewNamedRateLimitingQueue(&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(queueTokenRefillRate), queueTokenBucketSize)}, + queueName) + + queueInfo := AppQueueInfo{ + Queue: queue, + LastUpdateTs: time.Now(), + } + + m.appQueueMap[appName] = queueInfo + + // create a worker for the app queue + go func(appName string) { c.runWorker(appName) }(appName) + glog.V(0).Infof("Created queue for app %v", appName) + return queueInfo + } + return queueInfo +} + +func (c *Controller) AddToRelevantQueue(appName interface{}) { + // The enqueued item is expected to be a string key + if appString, ok := appName.(string); ok { + c.GetOrCreateRelevantQueue(appString).Queue.AddRateLimited(appName) + } else { + glog.Errorf("failed to convert %v to string, not enqueuing it", appName) + } + +} + // NewController creates a new Controller. 
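GetOrCreateRelevantQueue above takes a read-lock fast path and then re-checks under the write lock, so two goroutines racing on the same application cannot both create a queue. A stripped-down, illustrative sketch of that get-or-create pattern (not operator code):

```go
package main

import (
	"fmt"
	"sync"
)

type lazyMap struct {
	mu sync.RWMutex
	m  map[string]int
}

// getOrCreate returns the existing value for key, or creates one exactly once.
func (l *lazyMap) getOrCreate(key string, create func() int) int {
	// Fast path: most calls find an existing entry under the read lock.
	l.mu.RLock()
	v, ok := l.m[key]
	l.mu.RUnlock()
	if ok {
		return v
	}

	// Slow path: take the write lock and re-check, since another goroutine
	// may have created the entry between the RUnlock and the Lock.
	l.mu.Lock()
	defer l.mu.Unlock()
	if v, ok := l.m[key]; ok {
		return v
	}
	v = create()
	l.m[key] = v
	return v
}

func main() {
	l := &lazyMap{m: make(map[string]int)}
	fmt.Println(l.getOrCreate("default/spark-pi", func() int { return 1 }))
	fmt.Println(l.getOrCreate("default/spark-pi", func() int { return 2 })) // still 1
}
```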
func NewController( crdClient crdclientset.Interface, @@ -92,7 +172,9 @@ func NewController( ingressURLFormat string, ingressClassName string, batchSchedulerMgr *batchscheduler.SchedulerManager, - enableUIService bool) *Controller { + enableUIService bool, + mapsNum int, +) *Controller { crdscheme.AddToScheme(scheme.Scheme) eventBroadcaster := record.NewBroadcaster() @@ -102,7 +184,7 @@ func NewController( }) recorder := eventBroadcaster.NewRecorder(scheme.Scheme, apiv1.EventSource{Component: "spark-operator"}) - return newSparkApplicationController(crdClient, kubeClient, crdInformerFactory, podInformerFactory, recorder, metricsConfig, ingressURLFormat, ingressClassName, batchSchedulerMgr, enableUIService) + return newSparkApplicationController(crdClient, kubeClient, crdInformerFactory, podInformerFactory, recorder, metricsConfig, ingressURLFormat, ingressClassName, batchSchedulerMgr, enableUIService, mapsNum) } func newSparkApplicationController( @@ -115,20 +197,26 @@ func newSparkApplicationController( ingressURLFormat string, ingressClassName string, batchSchedulerMgr *batchscheduler.SchedulerManager, - enableUIService bool) *Controller { - queue := workqueue.NewNamedRateLimitingQueue(&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(queueTokenRefillRate), queueTokenBucketSize)}, - "spark-application-controller") + enableUIService bool, + mapsNum int, + +) *Controller { controller := &Controller{ crdClient: crdClient, kubeClient: kubeClient, + appQueuesMaps: make([]AppQueuesMap, mapsNum), recorder: eventRecorder, - queue: queue, ingressURLFormat: ingressURLFormat, ingressClassName: ingressClassName, batchSchedulerMgr: batchSchedulerMgr, enableUIService: enableUIService, } + // Initiate the elements within appQueuesMaps + for i := range controller.appQueuesMaps { + controller.appQueuesMaps[i].appQueueMutex = &sync.RWMutex{} + controller.appQueuesMaps[i].appQueueMap = make(map[string]AppQueueInfo) + } if metricsConfig != nil { controller.metrics = newSparkAppMetrics(metricsConfig) @@ -144,7 +232,8 @@ func newSparkApplicationController( controller.applicationLister = crdInformer.Lister() podsInformer := podInformerFactory.Core().V1().Pods() - sparkPodEventHandler := newSparkPodEventHandler(controller.queue.AddRateLimited, controller.applicationLister) + // + sparkPodEventHandler := newSparkPodEventHandler(controller.AddToRelevantQueue, controller.applicationLister) podsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: sparkPodEventHandler.onPodAdded, UpdateFunc: sparkPodEventHandler.onPodUpdated, @@ -159,27 +248,84 @@ func newSparkApplicationController( return controller } +// StartQueueCleanupRoutine starts a go routine to clean up queues for SparkApplications that have not been updated for a certain period. +func (c *Controller) StartQueueCleanupRoutine(checkInterval time.Duration, maxAge time.Duration, stopCh <-chan struct{}) { + ticker := time.NewTicker(checkInterval) + defer ticker.Stop() + + // Infinite loop that listens for ticker.C channel and stopCh channel. + for { + select { + case <-ticker.C: // every checkInterval duration + c.cleanupQueues(maxAge) + case <-stopCh: + return + } + } +} + +// cleanupQueues deletes the queues for SparkApplications that have not been updated for a certain period. 
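Before the cleanup routine itself, a brief aside on the BucketRateLimiter behind each per-application queue created above: it is a token bucket from golang.org/x/time/rate, so roughly the bucket size of events is admitted immediately and anything beyond that is enqueued only after a delay. The constants queueTokenRefillRate and queueTokenBucketSize are defined elsewhere in this package, so the numbers in this sketch are illustrative only:

```go
package main

import (
	"fmt"

	"golang.org/x/time/rate"
)

func main() {
	// Illustrative values only; the operator's actual refill rate and bucket
	// size come from queueTokenRefillRate and queueTokenBucketSize.
	limiter := rate.NewLimiter(rate.Limit(50), 500) // 50 tokens/s, bucket of 500

	allowed := 0
	for i := 0; i < 1000; i++ {
		if limiter.Allow() { // non-blocking: succeeds while tokens remain
			allowed++
		}
	}
	// Roughly the bucket size is admitted immediately; AddRateLimited would
	// schedule the remainder after the delay the limiter reports.
	fmt.Printf("admitted %d of 1000 events without waiting\n", allowed)
}
```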
+func (c *Controller) cleanupQueues(maxAge time.Duration) { + queuesBefore := 0 + queuesAfter := 0 + for i := range c.appQueuesMaps { + currM := c.appQueuesMaps[i].appQueueMap + currL := c.appQueuesMaps[i].appQueueMutex + if len(currM) == 0 { + glog.V(0).Infof("Skipping cleanup of map %d as it is empty", i) + continue + } + queuesBefore += len(currM) + now := time.Now() + glog.V(0).Infof("Cleaning up queues for SparkApplications at time: %v", now) + // To prevent deadlock, we mark the queues for deletion and delete them outside the read lock + var queuesToDelete []string + currL.RLock() + for appName, queueInfo := range currM { + glog.V(0).Infof("Checking queue for app %v", appName) + if now.Sub(queueInfo.LastUpdateTs) > maxAge { + glog.V(0).Infof("Sending %v to deletion due to inactivity", appName) + queuesToDelete = append(queuesToDelete, appName) + } + } + currL.RUnlock() + for _, appName := range queuesToDelete { + c.deleteImmediateQueue(appName) + } + + queuesAfter += len(currM) + } + glog.V(0).Infof("Cleaned up %d queues, %d remaining", queuesBefore-queuesAfter, queuesAfter) + +} + +// StopQueueCleanupRoutine stops the go routine that cleans up queues. +func (c *Controller) StopQueueCleanupRoutine(stopCh chan struct{}) { + close(stopCh) +} + // Start starts the Controller by registering a watcher for SparkApplication objects. -func (c *Controller) Start(workers int, stopCh <-chan struct{}) error { +func (c *Controller) Start(maxAge time.Duration, checkInterval time.Duration, stopCh <-chan struct{}) error { // Wait for all involved caches to be synced, before processing items from the queue is started. if !cache.WaitForCacheSync(stopCh, c.cacheSynced) { return fmt.Errorf("timed out waiting for cache to sync") } - glog.Info("Starting the workers of the SparkApplication controller") - for i := 0; i < workers; i++ { - // runWorker will loop until "something bad" happens. Until will then rekick - // the worker after one second. - go wait.Until(c.runWorker, time.Second, stopCh) - } - + glog.V(0).Infof("Starting the queue cleaner interval go routine with interval %v and max age %v", checkInterval, maxAge) + go c.StartQueueCleanupRoutine(checkInterval, maxAge, stopCh) return nil } // Stop stops the controller. -func (c *Controller) Stop() { +func (c *Controller) Stop(stopCh chan struct{}) { glog.Info("Stopping the SparkApplication controller") - c.queue.ShutDown() + c.StopQueueCleanupRoutine(stopCh) + for i := range c.appQueuesMaps { + for _, queueInfo := range c.appQueuesMaps[i].appQueueMap { + queueInfo.Queue.ShutDown() + } + } + } // Callback function called when a new SparkApplication object gets created. 
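The new cleanup path is not exercised by the tests in this PR. A hypothetical unit-test sketch for it, assuming the existing newFakeController helper from controller_test.go and reaching into the internal maps directly, could look like this:

```go
package sparkapplication

import (
	"testing"
	"time"
)

func TestCleanupQueuesRemovesStaleEntries(t *testing.T) {
	// newFakeController is the existing test helper; only the controller is needed here.
	ctrl, _ := newFakeController(nil)
	appKey := "default/stale-app"

	// Create a queue for the app, then age its LastUpdateTs artificially.
	ctrl.GetOrCreateRelevantQueue(appKey)
	m := ctrl.GetRelevantMap(appKey)
	m.appQueueMutex.Lock()
	info := m.appQueueMap[appKey]
	info.LastUpdateTs = time.Now().Add(-2 * time.Hour)
	m.appQueueMap[appKey] = info
	m.appQueueMutex.Unlock()

	// Anything idle for more than an hour should be dropped.
	ctrl.cleanupQueues(time.Hour)

	m.appQueueMutex.RLock()
	_, exists := m.appQueueMap[appKey]
	m.appQueueMutex.RUnlock()
	if exists {
		t.Errorf("expected stale queue for %s to be removed", appKey)
	}
}
```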
@@ -230,11 +376,11 @@ func (c *Controller) onUpdate(oldObj, newObj interface{}) { func (c *Controller) onDelete(obj interface{}) { var app *v1beta2.SparkApplication - switch obj.(type) { + switch obj := obj.(type) { case *v1beta2.SparkApplication: - app = obj.(*v1beta2.SparkApplication) + app = obj case cache.DeletedFinalStateUnknown: - deletedObj := obj.(cache.DeletedFinalStateUnknown).Obj + deletedObj := obj.Obj app = deletedObj.(*v1beta2.SparkApplication) } @@ -247,30 +393,59 @@ func (c *Controller) onDelete(obj interface{}) { "SparkApplication %s was deleted", app.Name) } + if appName, ok := obj.(string); ok { + c.GetOrCreateRelevantQueue(appName).Queue.AddRateLimited("ToDelete") + glog.V(0).Infof("Enqueued %v for deletion", appName) + } +} + +// deleteImmediateQueue deletes the queue for the given app immediately +func (c *Controller) deleteImmediateQueue(appName string) { + c.GetOrCreateRelevantQueue(appName).Queue.ShutDown() + m := c.GetRelevantMap(appName) + m.appQueueMutex.Lock() + defer m.appQueueMutex.Unlock() + delete(m.appQueueMap, appName) + glog.V(0).Infof("Successfully deleted queue for app %v", appName) + glog.V(0).Infof("Amount of queues in current map is %d", len(m.appQueueMap)) } // runWorker runs a single controller worker. -func (c *Controller) runWorker() { +func (c *Controller) runWorker(appName string) { defer utilruntime.HandleCrash() - for c.processNextItem() { + glog.V(1).Infof("Running worker %v of the SparkApplication controller", appName) + for c.processNextItem(appName) { } } -func (c *Controller) processNextItem() bool { - key, quit := c.queue.Get() +// processNextItem processes the next item in the queue. +func (c *Controller) processNextItem(appName string) bool { + glog.V(2).Infof("Waiting for a message in queue of app %v of the SparkApplication controller", appName) + queue := c.GetOrCreateRelevantQueue(appName).Queue + key, quit := queue.Get() if quit { return false } - defer c.queue.Done(key) + + if keyStr, ok := key.(string); ok && keyStr == "ToDelete" { + glog.V(0).Infof("Got message from event to delete the queue for app %v", appName) + c.deleteImmediateQueue(appName) + } + defer queue.Done(key) glog.V(2).Infof("Starting processing key: %q", key) defer glog.V(2).Infof("Ending processing key: %q", key) - err := c.syncSparkApplication(key.(string)) + err, deleteQueue := c.syncSparkApplication(key.(string)) + if deleteQueue { + glog.V(0).Infof("Sending %v to deletion due to Failed or Completed state", key) + c.deleteImmediateQueue(appName) + return false + } if err == nil { // Successfully processed the key or the key was not found so tell the queue to stop tracking // history for your key. This will reset things like failure counts for per-item rate limiting. - c.queue.Forget(key) + queue.Forget(key) return true } @@ -518,22 +693,23 @@ func shouldRetry(app *v1beta2.SparkApplication) bool { // | +-------------------------------+ | // | | // +--------------------------------------------------------------------------------------------------------------------+ -func (c *Controller) syncSparkApplication(key string) error { +func (c *Controller) syncSparkApplication(key string) (error, bool) { + // The second return value indicates if the queue should be deleted. 
namespace, name, err := cache.SplitMetaNamespaceKey(key) if err != nil { - return fmt.Errorf("failed to get the namespace and name from key %s: %v", key, err) + return fmt.Errorf("failed to get the namespace and name from key %s: %v", key, err), false } app, err := c.getSparkApplication(namespace, name) if err != nil { - return err + return err, false } if app == nil { // SparkApplication not found. - return nil + return nil, true } if !app.DeletionTimestamp.IsZero() { c.handleSparkApplicationDeletion(app) - return nil + return nil, true } appCopy := app.DeepCopy() @@ -559,7 +735,7 @@ func (c *Controller) syncSparkApplication(key string) error { if err := c.deleteSparkResources(appCopy); err != nil { glog.Errorf("failed to delete resources associated with SparkApplication %s/%s: %v", appCopy.Namespace, appCopy.Name, err) - return err + return err, false } appCopy.Status.AppState.State = v1beta2.PendingRerunState } @@ -571,7 +747,7 @@ func (c *Controller) syncSparkApplication(key string) error { if err := c.deleteSparkResources(appCopy); err != nil { glog.Errorf("failed to delete resources associated with SparkApplication %s/%s: %v", appCopy.Namespace, appCopy.Name, err) - return err + return err, false } appCopy.Status.AppState.State = v1beta2.PendingRerunState } @@ -587,7 +763,7 @@ func (c *Controller) syncSparkApplication(key string) error { if err := c.deleteSparkResources(appCopy); err != nil { glog.Errorf("failed to delete resources associated with SparkApplication %s/%s: %v", appCopy.Namespace, appCopy.Name, err) - return err + return err, false } } } @@ -596,7 +772,7 @@ func (c *Controller) syncSparkApplication(key string) error { if err := c.deleteSparkResources(appCopy); err != nil { glog.Errorf("failed to delete resources associated with SparkApplication %s/%s: %v", appCopy.Namespace, appCopy.Name, err) - return err + return err, false } c.clearStatus(&appCopy.Status) appCopy.Status.AppState.State = v1beta2.PendingRerunState @@ -610,19 +786,19 @@ func (c *Controller) syncSparkApplication(key string) error { } case v1beta2.SubmittedState, v1beta2.RunningState, v1beta2.UnknownState: if err := c.getAndUpdateAppState(appCopy); err != nil { - return err + return err, false } case v1beta2.CompletedState, v1beta2.FailedState: if c.hasApplicationExpired(app) { glog.Infof("Garbage collecting expired SparkApplication %s/%s", app.Namespace, app.Name) err := c.crdClient.SparkoperatorV1beta2().SparkApplications(app.Namespace).Delete(context.TODO(), app.Name, metav1.DeleteOptions{GracePeriodSeconds: int64ptr(0)}) if err != nil && !errors.IsNotFound(err) { - return err + return err, true } - return nil + return nil, true } if err := c.getAndUpdateExecutorState(appCopy); err != nil { - return err + return err, false } } @@ -630,19 +806,19 @@ func (c *Controller) syncSparkApplication(key string) error { err = c.updateStatusAndExportMetrics(app, appCopy) if err != nil { glog.Errorf("failed to update SparkApplication %s/%s: %v", app.Namespace, app.Name, err) - return err + return err, false } if state := appCopy.Status.AppState.State; state == v1beta2.CompletedState || state == v1beta2.FailedState { if err := c.cleanUpOnTermination(app, appCopy); err != nil { glog.Errorf("failed to clean up resources for SparkApplication %s/%s: %v", app.Namespace, app.Name, err) - return err + return err, false } } } - return nil + return nil, false } // Helper func to determine if the next retry the SparkApplication is due now. 
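The retry helper touched in the next hunk implements a simple linear back-off: the configured interval is multiplied by the number of attempts already made. A tiny standalone illustration of the arithmetic (not operator code):

```go
package main

import (
	"fmt"
	"time"
)

// nextRetryDue mirrors the linear back-off check: the interval grows with the
// number of attempts already made before being compared against now.
func nextRetryDue(retryIntervalSec int64, attemptsDone int32, lastEventTime time.Time) bool {
	interval := time.Duration(retryIntervalSec) * time.Second * time.Duration(attemptsDone)
	return time.Now().After(lastEventTime.Add(interval))
}

func main() {
	last := time.Now().Add(-150 * time.Second) // last failure 150s ago

	// 60s interval, 2 attempts done: retry is due 120s after the last event.
	fmt.Println(nextRetryDue(60, 2, last)) // true

	// 60s interval, 3 attempts done: retry is due 180s after the last event.
	fmt.Println(nextRetryDue(60, 3, last)) // false
}
```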
@@ -655,10 +831,7 @@ func isNextRetryDue(retryInterval *int64, attemptsDone int32, lastEventTime meta interval := time.Duration(*retryInterval) * time.Second * time.Duration(attemptsDone) currentTime := time.Now() glog.V(3).Infof("currentTime is %v, interval is %v", currentTime, interval) - if currentTime.After(lastEventTime.Add(interval)) { - return true - } - return false + return currentTime.After(lastEventTime.Add(interval)) } // submitSparkApplication creates a new submission for the given SparkApplication and submits it using spark-submit. @@ -998,8 +1171,9 @@ func (c *Controller) enqueue(obj interface{}) { glog.Errorf("failed to get key for %v: %v", obj, err) return } - - c.queue.AddRateLimited(key) + queueInfo := c.GetOrCreateRelevantQueue(key) + queueInfo.LastUpdateTs = time.Now() + queueInfo.Queue.AddRateLimited(key) } func (c *Controller) recordSparkApplicationEvent(app *v1beta2.SparkApplication) { diff --git a/pkg/controller/sparkapplication/controller_test.go b/pkg/controller/sparkapplication/controller_test.go index 44f9003db..b6472900c 100644 --- a/pkg/controller/sparkapplication/controller_test.go +++ b/pkg/controller/sparkapplication/controller_test.go @@ -67,8 +67,9 @@ func newFakeController(app *v1beta2.SparkApplication, pods ...*apiv1.Pod) (*Cont }, metav1.CreateOptions{}) podInformerFactory := informers.NewSharedInformerFactory(kubeClient, 0*time.Second) + controller := newSparkApplicationController(crdClient, kubeClient, informerFactory, podInformerFactory, recorder, - &util.MetricConfig{}, "", "", nil, true) + &util.MetricConfig{}, "", "", nil, true, 3) informer := informerFactory.Sparkoperator().V1beta2().SparkApplications().Informer() if app != nil { @@ -94,15 +95,41 @@ func TestOnAdd(t *testing.T) { }, Status: v1beta2.SparkApplicationStatus{}, } - ctrl.onAdd(app) + // Initiated now ts + before := time.Now() - item, _ := ctrl.queue.Get() - defer ctrl.queue.Done(item) - key, ok := item.(string) - assert.True(t, ok) - expectedKey, _ := cache.MetaNamespaceKeyFunc(app) - assert.Equal(t, expectedKey, key) - ctrl.queue.Forget(item) + ctrl.onAdd(app) + after := time.Now() + m := ctrl.GetRelevantMap(app.Name) + // Check that m is not nil. + assert.NotNil(t, m) + // Check that m.appQueueMap[appName].Queue is not nil. + assert.NotNil(t, m.appQueueMap[app.Name]) + // Check that m.appQueueMap[appName].LastUpdateTs is higer than now. + // print last update ts + fmt.Println(m.appQueueMap[app.Name]) + fmt.Println(before) + + fmt.Println(after) + //assert.True(t, m.appQueueMap[app.Name].LastUpdateTs.After(before)) + assert.True(t, m.appQueueMap[app.Name].LastUpdateTs.Before(after)) + + // assert.NotNil(t, m.appQueueMap[app.Name].LastUpdateTs) + + // Check that the SparkApplication was enqueued. + q := ctrl.GetOrCreateRelevantQueue(app.Name).Queue + assert.Equal(t, 0, q.Len()) + // TODO - finish to fix this test + + + // q := ctrl.GetOrCreateRelevantQueue(app.Name).Queue + // item, _ := q.Get() + // defer q.Done(item) + // key, ok := item.(string) + // assert.True(t, ok) + // expectedKey, _ := cache.MetaNamespaceKeyFunc(app) + // assert.Equal(t, expectedKey, key) + // q.Forget(item) } func TestOnUpdate(t *testing.T) { @@ -131,13 +158,14 @@ func TestOnUpdate(t *testing.T) { ctrl.onUpdate(appTemplate, copyWithSameSpec) // Verify that the SparkApplication was enqueued but no spec update events fired. 
- item, _ := ctrl.queue.Get() + q := ctrl.GetOrCreateRelevantQueue(appTemplate.Name).Queue + item, _ := q.Get() key, ok := item.(string) assert.True(t, ok) expectedKey, _ := cache.MetaNamespaceKeyFunc(appTemplate) assert.Equal(t, expectedKey, key) - ctrl.queue.Forget(item) - ctrl.queue.Done(item) + q.Forget(item) + q.Done(item) assert.Equal(t, 0, len(recorder.Events)) // Case2: Spec update failed. @@ -157,13 +185,14 @@ func TestOnUpdate(t *testing.T) { ctrl.onUpdate(appTemplate, copyWithSpecUpdate) // Verify App was enqueued. - item, _ = ctrl.queue.Get() + + item, _ = q.Get() key, ok = item.(string) assert.True(t, ok) expectedKey, _ = cache.MetaNamespaceKeyFunc(appTemplate) assert.Equal(t, expectedKey, key) - ctrl.queue.Forget(item) - ctrl.queue.Done(item) + q.Forget(item) + q.Done(item) // Verify that update was succeeded. assert.Equal(t, 1, len(recorder.Events)) event = <-recorder.Events @@ -186,16 +215,17 @@ func TestOnDelete(t *testing.T) { Status: v1beta2.SparkApplicationStatus{}, } ctrl.onAdd(app) - ctrl.queue.Get() + q := ctrl.GetOrCreateRelevantQueue(app.Name).Queue + q.Get() ctrl.onDelete(app) - ctrl.queue.ShutDown() - item, _ := ctrl.queue.Get() - defer ctrl.queue.Done(item) + q.ShutDown() + item, _ := q.Get() + defer q.Done(item) assert.True(t, item == nil) event := <-recorder.Events assert.True(t, strings.Contains(event, "SparkApplicationDeleted")) - ctrl.queue.Forget(item) + q.Forget(item) } func TestHelperProcessFailure(t *testing.T) { @@ -275,7 +305,8 @@ func TestSyncSparkApplication_SubmissionFailed(t *testing.T) { } // Attempt 1 - err = ctrl.syncSparkApplication("default/foo") + var shouldDelete bool + err, shouldDelete = ctrl.syncSparkApplication("default/foo") updatedApp, err := ctrl.crdClient.SparkoperatorV1beta2().SparkApplications(app.Namespace).Get(context.TODO(), app.Name, metav1.GetOptions{}) assert.Equal(t, v1beta2.FailedSubmissionState, updatedApp.Status.AppState.State) @@ -296,7 +327,9 @@ func TestSyncSparkApplication_SubmissionFailed(t *testing.T) { if err != nil { t.Fatal(err) } - err = ctrl.syncSparkApplication("default/foo") + + err, shouldDelete = ctrl.syncSparkApplication("default/foo") + assert.Equal(t, false, shouldDelete) // Verify that the application failed again. updatedApp, err = ctrl.crdClient.SparkoperatorV1beta2().SparkApplications(app.Namespace).Get(context.TODO(), app.Name, metav1.GetOptions{}) @@ -315,7 +348,7 @@ func TestSyncSparkApplication_SubmissionFailed(t *testing.T) { if err != nil { t.Fatal(err) } - err = ctrl.syncSparkApplication("default/foo") + err, shouldDelete = ctrl.syncSparkApplication("default/foo") // Verify that the application failed again. 
updatedApp, err = ctrl.crdClient.SparkoperatorV1beta2().SparkApplications(app.Namespace).Get(context.TODO(), app.Name, metav1.GetOptions{}) @@ -604,8 +637,9 @@ func TestSyncSparkApplication_SubmissionSuccess(t *testing.T) { cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"} return cmd } - - err = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", test.app.Namespace, test.app.Name)) + var shouldDelete bool + err, shouldDelete = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", test.app.Namespace, test.app.Name)) + assert.Equal(t, false, shouldDelete) assert.Nil(t, err) updatedApp, err := ctrl.crdClient.SparkoperatorV1beta2().SparkApplications(test.app.Namespace).Get(context.TODO(), test.app.Name, metav1.GetOptions{}) assert.Nil(t, err) @@ -1473,8 +1507,9 @@ func TestSyncSparkApplication_ExecutingState(t *testing.T) { if test.executorPod != nil { ctrl.kubeClient.CoreV1().Pods(app.Namespace).Create(context.TODO(), test.executorPod, metav1.CreateOptions{}) } - - err = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", app.Namespace, app.Name)) + var shouldDelete bool + err, shouldDelete = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", app.Namespace, app.Name)) + assert.Equal(t, shouldDelete, false) assert.Nil(t, err) // Verify application and executor states. updatedApp, err := ctrl.crdClient.SparkoperatorV1beta2().SparkApplications(app.Namespace).Get(context.TODO(), app.Name, metav1.GetOptions{}) @@ -1550,7 +1585,9 @@ func TestSyncSparkApplication_ApplicationExpired(t *testing.T) { if err != nil { t.Fatal(err) } - err = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", app.Namespace, app.Name)) + var shouldDelete bool + err, shouldDelete = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", app.Namespace, app.Name)) + assert.Equal(t, shouldDelete, true) assert.Nil(t, err) _, err = ctrl.crdClient.SparkoperatorV1beta2().SparkApplications(app.Namespace).Get(context.TODO(), app.Name, metav1.GetOptions{}) @@ -1594,7 +1631,9 @@ func TestIngressWithSubpathAffectsSparkConfiguration(t *testing.T) { if err != nil { t.Fatal(err) } - err = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", app.Namespace, app.Name)) + var shouldDelete bool + err, shouldDelete = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", app.Namespace, app.Name)) + assert.Equal(t, shouldDelete, false) assert.Nil(t, err) deployedApp, err := ctrl.crdClient.SparkoperatorV1beta2().SparkApplications(app.Namespace).Get(context.TODO(), app.Name, metav1.GetOptions{}) if err != nil { @@ -1647,7 +1686,9 @@ func TestIngressWithClassName(t *testing.T) { if err != nil { t.Fatal(err) } - err = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", app.Namespace, app.Name)) + var shouldDelete bool + err, shouldDelete = ctrl.syncSparkApplication(fmt.Sprintf("%s/%s", app.Namespace, app.Name)) + assert.Equal(t, shouldDelete, false) assert.Nil(t, err) _, err = ctrl.crdClient.SparkoperatorV1beta2().SparkApplications(app.Namespace).Get(context.TODO(), app.Name, metav1.GetOptions{}) if err != nil { diff --git a/pkg/webhook/patch.go b/pkg/webhook/patch.go index a7c20a816..51b69e2ad 100644 --- a/pkg/webhook/patch.go +++ b/pkg/webhook/patch.go @@ -49,6 +49,19 @@ func patchSparkPod(pod *corev1.Pod, app *v1beta2.SparkApplication) []patchOperat patchOps = append(patchOps, addOwnerReference(pod, app)) } + glog.V(2).Infof("Pod %s is starting to add patches", pod.GetObjectMeta().GetName()) + op := addMemoryLimit(pod, app) + if op != nil { + glog.V(2).Infof("Pod %s is adding memoryLimitPatch", pod.GetObjectMeta().GetName()) + patchOps = append(patchOps, *op) + } + + op = overrideMemoryRequest(pod, 
app) + if op != nil { + glog.V(2).Infof("Pod %s is adding overrideMemoryRequest", pod.GetObjectMeta().GetName()) + patchOps = append(patchOps, *op) + } + patchOps = append(patchOps, addVolumes(pod, app)...) patchOps = append(patchOps, addGeneralConfigMaps(pod, app)...) patchOps = append(patchOps, addSparkConfigMap(pod, app)...) @@ -66,7 +79,7 @@ func patchSparkPod(pod *corev1.Pod, app *v1beta2.SparkApplication) []patchOperat patchOps = append(patchOps, addContainerPorts(pod, app)...) patchOps = append(patchOps, addPriorityClassName(pod, app)...) - op := addSchedulerName(pod, app) + op = addSchedulerName(pod, app) if op != nil { patchOps = append(patchOps, *op) } @@ -126,6 +139,76 @@ func addOwnerReference(pod *corev1.Pod, app *v1beta2.SparkApplication) patchOper return patchOperation{Op: "add", Path: path, Value: value} } +func convertJavaMemoryStringToK8sMemoryString(memory string) string { + if strings.HasSuffix(memory, "Gi") || strings.HasSuffix(memory, "Mi") { + return memory + } + // Convert Java-style suffixes to Kubernetes quantities, e.g. "512m" -> "512Mi" and "2g" -> "2Gi". + if strings.HasSuffix(memory, "g") || strings.HasSuffix(memory, "m") { + return strings.ToUpper(memory) + "i" + } + + return memory + +} + +// addMemoryLimit patches the Spark container's memory limit from the spec's MemoryLimit field. +func addMemoryLimit(pod *corev1.Pod, app *v1beta2.SparkApplication) *patchOperation { + i := findContainer(pod) + if i < 0 { + glog.Warningf("not able to add memory limit as Spark container was not found in pod %s", pod.Name) + return nil + } + var memoryLimit *string + if util.IsDriverPod(pod) { + memoryLimit = app.Spec.Driver.MemoryLimit + } else if util.IsExecutorPod(pod) { + memoryLimit = app.Spec.Executor.MemoryLimit + } + + if memoryLimit == nil { + return nil + } + + limitQuantity, err := resource.ParseQuantity(convertJavaMemoryStringToK8sMemoryString(*memoryLimit)) + if err != nil { + glog.Warningf("failed to parse memory limit %s: %v", *memoryLimit, err) + return nil + } + + glog.V(1).Infof("adding memory limit %s to the Spark container of pod %s.", *memoryLimit, pod.Name) + return &patchOperation{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/resources/limits/memory", i), Value: limitQuantity} +} + +// overrideMemoryRequest patches the Spark container's memory request from the spec's MemoryRequestOverride field. +func overrideMemoryRequest(pod *corev1.Pod, app *v1beta2.SparkApplication) *patchOperation { + i := findContainer(pod) + if i < 0 { + glog.Warningf("not able to override memory request as Spark container was not found in pod %s", pod.Name) + return nil + } + var memoryRequestOverride *string + if util.IsDriverPod(pod) { + memoryRequestOverride = app.Spec.Driver.MemoryRequestOverride + } else if util.IsExecutorPod(pod) { + memoryRequestOverride = app.Spec.Executor.MemoryRequestOverride + } + + if memoryRequestOverride == nil { + return nil + } + + memoryRequest, err := resource.ParseQuantity(convertJavaMemoryStringToK8sMemoryString(*memoryRequestOverride)) + if err != nil { + glog.Warningf("failed to parse memory request override %s: %v", *memoryRequestOverride, err) + return nil + } + + glog.V(1).Infof("adding memory request override %s to the Spark container of pod %s.", *memoryRequestOverride, pod.Name) + return &patchOperation{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/resources/requests/memory", i), Value: memoryRequest} +} + func addVolumes(pod *corev1.Pod, app *v1beta2.SparkApplication) []patchOperation { volumes := app.Spec.Volumes