v0.7.4 - External Service (#392)
* TPC-DS: Queries updated

* Docs: Read current version number

* TPC-DS: More tests

* Bexhoma: Check endpoints for service not managed by bexhoma

* Bexhoma: Dummy Database Service template

* Bexhoma: Monitor daemonset or sidecar (not both)

* Bexhoma: Result tests do not dump any results themselves

* Bexhoma: Show more info about monitoring processes

* Docs: Notes about Cloud databases

* Example: Cloud Database PostgreSQL compatible

* Docs: Notes about Cloud databases

* Docs: Some more tests

* Bexhoma: config.monitoring_sut to deactivate monitoring of SUT for services outside of K8s

* Bexhoma: Monitoring test tries to fetch the sum of memory from 5 minutes ago

* TPC-H: Support for Cloud Service

* Bexhoma: Monitoring test follows redirects

* TPC-H: PostgreSQL loader based on alpine

* More tests

* Bexhoma: Evaluator alpine-based

* Database Service: Dummy template for SUT

* Bexhoma: Alpine and Python 3.12.8 as basis for dbmsbenchmarker

* Dev: Test for summary markdown

* Docs: Notes about cloud databases

* Bexhoma: Debug output of the monitoring health test shows the HTTP status code

* Bexhoma: Back to Debian variant for dbmsbenchmarker (Java issues) - EOL

* Docs: Some improvements

* Database Service: More tests

* Docs: Some improvements

* Tests: Clean test cases

* Docs: Some improvements

* Tests: Clean test cases

* Bexhoma: Use simple shell instead of bash to get host info

* Tests: Clean test cases

* Docs: YugabyteDB changes to monitoring and host info retrieval

* Docs: Status of CockroachDB

* Tests: Clean test cases clouds
perdelt authored Dec 20, 2024
1 parent a01dd09 commit b06d20b
Showing 85 changed files with 21,615 additions and 1,708 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -13,3 +13,4 @@ docs/html/*
api/*
bexhoma/__pycache__/*
/cluster-monitoring-default.config
logs_tests/local/*
86 changes: 74 additions & 12 deletions benchbase.py
@@ -33,7 +33,7 @@
parser = argparse.ArgumentParser(description=description)
parser.add_argument('mode', help='start sut, also load data or also run the TPC-C queries', choices=['run', 'start', 'load'])
parser.add_argument('-aws', '--aws', help='fix components to node groups at AWS', action='store_true', default=False)
parser.add_argument('-dbms','--dbms', help='DBMS to load the data', choices=['PostgreSQL', 'MySQL', 'MariaDB', 'YugabyteDB', 'CockroachDB'], default=[], action='append')
parser.add_argument('-dbms','--dbms', help='DBMS to load the data', choices=['PostgreSQL', 'MySQL', 'MariaDB', 'YugabyteDB', 'CockroachDB', 'DatabaseService'], default=[], action='append')
parser.add_argument('-db', '--debug', help='dump debug information', action='store_true')
parser.add_argument('-sl', '--skip-loading', help='do not ingest, start benchmarking immediately', action='store_true', default=False)
parser.add_argument('-cx', '--context', help='context of Kubernetes (for a multi cluster environment), default is current context', default=None)
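For orientation, a hedged usage sketch based only on the options defined above (the script defines further options not shown in this hunk):

# Hypothetical invocation: python benchbase.py run -dbms DatabaseService -sl
args = parser.parse_args(['run', '-dbms', 'DatabaseService', '-sl'])
# args.mode == 'run', args.dbms == ['DatabaseService'], args.skip_loading == True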
@@ -344,23 +344,27 @@
if skip_loading:
config.loading_deactivated = True
config.sut_service_name = "yb-tserver-service" # fix service name of SUT, because it is not managed by bexhoma
config.sut_container_name = "yb-tserver" # fix container name of SUT
config.sut_container_name = '' # fix container name of SUT
def get_worker_pods(self):
"""
Returns a list of all pod names of workers for the current SUT.
By default, the component name is 'worker' for a bexhoma-managed DBMS.
This is used, for example, to find the pods of the workers in order to get the host info (CPU, RAM, node name, ...).
YugabyteDB: These are yb-tserver-0, yb-tserver-1, etc.
:return: list of endpoints
"""
pods_worker = self.experiment.cluster.get_pods(component='worker', configuration=self.configuration, experiment=self.code)
pods_worker = ['yb-tserver-0', 'yb-tserver-1', 'yb-tserver-2']
#pods_worker = self.experiment.cluster.get_pods(app='', component='', configuration='yb-tserver', experiment='')
#print("****************", pods_worker)
return pods_worker
#config.get_worker_pods = types.MethodType(get_worker_pods, config)
config.get_worker_pods = types.MethodType(get_worker_pods, config)
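The override above is bound to the single config instance via types.MethodType rather than by subclassing. A minimal, self-contained sketch of that pattern (the Config class below is a stand-in for illustration, not the actual bexhoma class):

import types

class Config:
    def get_worker_pods(self):
        # default behaviour: pods are discovered because bexhoma manages them
        return []

config = Config()

def get_worker_pods(self):
    # YugabyteDB is not managed by bexhoma, so the pod names are fixed
    return ['yb-tserver-0', 'yb-tserver-1', 'yb-tserver-2']

# bind the replacement to this one instance only; other instances keep the default
config.get_worker_pods = types.MethodType(get_worker_pods, config)
print(config.get_worker_pods())   # ['yb-tserver-0', 'yb-tserver-1', 'yb-tserver-2']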
def create_monitoring(self, app='', component='monitoring', experiment='', configuration=''):
"""
Generate a name for the monitoring component.
This is used in a pattern for promql.
Basically this is `{app}-{component}-{configuration}-{experiment}-{client}`.
For YugabyteDB, the service to be monitored is named like 'yb-tserver-'.
For YugabyteDB, the service of the SUT to be monitored is named like 'yb-tserver-'.
:param app: app the component belongs to
:param component: Component, for example sut or monitoring
@@ -379,28 +383,37 @@ def get_worker_endpoints(self):
Returns all endpoints of a headless service that monitors nodes of a distributed DBMS.
These are IPs of cAdvisor instances.
The endpoint list is to be filled in a config of an instance of Prometheus.
For YugabyteDB the service is fixed to be 'bexhoma-service-monitoring-default' and does not depend on the experiment.
By default, the workers can be found by the name of their component (worker-0 etc).
This is necessary when we have sidecar containers attached to the workers of a distributed DBMS.
:return: list of endpoints
"""
endpoints = self.experiment.cluster.get_service_endpoints(service_name="bexhoma-service-monitoring-default")
endpoints = []
#name_worker = self.generate_component_name(component='worker', configuration=self.configuration, experiment=self.code)
pods_worker = self.get_worker_pods()
for pod in pods_worker:
#endpoint = '{worker}.{service_sut}'.format(worker=pod, service_sut=name_worker)
endpoint = '{worker}'.format(worker=pod)
endpoints.append(endpoint)
print('Worker Endpoint: {endpoint}'.format(endpoint = endpoint))
self.logger.debug("yugabytedb.get_worker_endpoints({})".format(endpoints))
return endpoints
#config.get_worker_endpoints = types.MethodType(get_worker_endpoints, config)
config.get_worker_endpoints = types.MethodType(get_worker_endpoints, config)
def set_metric_of_config(self, metric, host, gpuid):
"""
Returns a promql query.
Parameters in this query are substituted, so that Prometheus finds the correct metric.
Example: In 'sum(irate(container_cpu_usage_seconds_total{{container_label_io_kubernetes_pod_name=~"(.*){configuration}-{experiment}(.*)", container_label_io_kubernetes_pod_name=~"(.*){configuration}-{experiment}(.*)", container_label_io_kubernetes_container_name="dbms"}}[1m]))'
configuration and experiment are placeholders and will be replaced by concrete values.
Here: We do not have a SUT that is specific to the experiment or configuration.
YugabyteDB: We do not have a SUT that is specific to the experiment or configuration. The pod names follow a pattern like yb-tserver and there is no container name.
:param metric: Parametrized promql query
:param host: Name of the host the metrics should be collected from
:param gpuid: GPU that the metrics should watch
:return: promql query without parameters
"""
metric = metric.replace(', container="dbms"', '')
metric = metric.replace(', container_label_io_kubernetes_container_name="dbms"', '')
return metric.format(host=host, gpuid=gpuid, configuration='yb-tserver', experiment='')
config.set_metric_of_config = types.MethodType(set_metric_of_config, config)
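To illustrate the substitution performed by set_metric_of_config, a hedged example with a simplified metric template (the real templates come from the cluster config and may differ):

template = ('sum(irate(container_cpu_usage_seconds_total{{'
            'container_label_io_kubernetes_pod_name=~"(.*){configuration}-{experiment}(.*)", '
            'container_label_io_kubernetes_container_name="dbms"}}[1m]))')
# YugabyteDB pods expose no container named 'dbms', so that label is dropped
template = template.replace(', container_label_io_kubernetes_container_name="dbms"', '')
print(template.format(host='', gpuid='', configuration='yb-tserver', experiment=''))
# -> sum(irate(container_cpu_usage_seconds_total{container_label_io_kubernetes_pod_name=~"(.*)yb-tserver-(.*)"}[1m]))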
config.set_loading_parameters(
@@ -452,8 +465,6 @@ def set_metric_of_config(self, metric, host, gpuid):
)
#print(executor_list)
config.add_benchmark_list(executor_list)
#print(executor_list)
config.add_benchmark_list(executor_list)
cluster.max_sut = 1 # can only run 1 in same cluster because of fixed service
if ("CockroachDB" in args.dbms):# or len(args.dbms) == 0): # not included per default
# CockroachDB
@@ -508,9 +519,60 @@ def set_metric_of_config(self, metric, host, gpuid):
)
#print(executor_list)
config.add_benchmark_list(executor_list)
#cluster.max_sut = 1 # can only run 1 in same cluster because of fixed service
if ("DatabaseService" in args.dbms):# or len(args.dbms) == 0): # not included per default
# DatabaseService
name_format = 'DatabaseService-{threads}-{pods}-{target}'
config = configurations.benchbase(experiment=experiment, docker='DatabaseService', configuration=name_format.format(threads=loading_threads, pods=loading_pods, target=loading_target), alias='DatabaseService')
config.monitoring_sut = False # cannot be monitored since outside of K8s
if skip_loading:
config.loading_deactivated = True
config.set_loading_parameters(
PARALLEL = str(loading_pods), # =1
SF = SF,
BENCHBASE_BENCH = type_of_benchmark,#'tpcc',
BENCHBASE_PROFILE = 'postgres',
BEXHOMA_DATABASE = 'postgres',
BEXHOMA_HOST = 'bexhoma-service.perdelt.svc.cluster.local',
#BENCHBASE_TARGET = int(target),
BENCHBASE_TERMINALS = loading_threads_per_pod,
BENCHBASE_TIME = SD,
BENCHBASE_ISOLATION = "TRANSACTION_READ_COMMITTED",
)
config.set_loading(parallel=loading_pods, num_pods=loading_pods)
executor_list = []
for factor_benchmarking in num_benchmarking_target_factors:#range(1, 9):#range(1, 2):#range(1, 15):
benchmarking_target = target_base*factor_benchmarking#4*4096*t
for benchmarking_threads in num_benchmarking_threads:
for benchmarking_pods in num_benchmarking_pods:#[1,2]:#[1,8]:#range(2,5):
for num_executor in list_clients:
benchmarking_pods_scaled = num_executor*benchmarking_pods
benchmarking_threads_per_pod = int(benchmarking_threads/benchmarking_pods)
benchmarking_target_per_pod = int(benchmarking_target/benchmarking_pods)
"""
print("benchmarking_target", benchmarking_target)
print("benchmarking_pods", benchmarking_pods)
print("benchmarking_pods_scaled", benchmarking_pods_scaled)
print("benchmarking_threads", benchmarking_threads)
print("benchmarking_threads_per_pod", benchmarking_threads_per_pod)
print("benchmarking_target_per_pod", benchmarking_target_per_pod)
"""
executor_list.append(benchmarking_pods_scaled)
config.add_benchmarking_parameters(
PARALLEL = str(benchmarking_pods_scaled),
SF = SF,
BENCHBASE_BENCH = type_of_benchmark,#'tpcc',
BENCHBASE_PROFILE = 'postgres',
BEXHOMA_DATABASE = 'postgres',
BEXHOMA_HOST = 'bexhoma-service.perdelt.svc.cluster.local',
BENCHBASE_TARGET = benchmarking_target_per_pod,
BENCHBASE_TERMINALS = benchmarking_threads_per_pod,
BENCHBASE_TIME = SD,
BENCHBASE_ISOLATION = "TRANSACTION_READ_COMMITTED",
)
#print(executor_list)
config.add_benchmark_list(executor_list)
cluster.max_sut = 1 # can only run 1 in same cluster because of fixed service
#cluster.max_sut = 1 # can only run 1 in same cluster because of fixed service
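For orientation, the per-pod arithmetic in the loop above splits the total terminal and target counts evenly across the benchmarking pods; a hypothetical example (values chosen only for illustration):

num_executor = 2            # parallel executors per client entry
benchmarking_pods = 2
benchmarking_threads = 64   # total BenchBase terminals across all pods
benchmarking_target = 16384 # total target rate across all pods

benchmarking_pods_scaled = num_executor * benchmarking_pods                    # 4
benchmarking_threads_per_pod = int(benchmarking_threads / benchmarking_pods)   # 32
benchmarking_target_per_pod = int(benchmarking_target / benchmarking_pods)     # 8192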
##############
### wait for necessary nodegroups to have planned size
##############
17 changes: 12 additions & 5 deletions bexhoma/clusters.py
@@ -37,6 +37,7 @@
import urllib.request
import urllib.parse
from pprint import pprint
from datetime import datetime, timedelta

from dbmsbenchmarker import *

@@ -865,9 +866,9 @@ def execute_command_in_pod(self, command, pod='', container='', params=''):
#pod = self.activepod
command_clean = command.replace('"','\\"')
if len(container) > 0:
fullcommand = 'kubectl --context {context} exec {pod} --container={container} -- bash -c "{command}"'.format(context=self.context, pod=pod, container=container, command=command_clean)
fullcommand = 'kubectl --context {context} exec {pod} --container={container} -- sh -c "{command}"'.format(context=self.context, pod=pod, container=container, command=command_clean)
else:
fullcommand = 'kubectl --context {context} exec {pod} -- bash -c "{command}"'.format(context=self.context, pod=pod, command=command_clean)
fullcommand = 'kubectl --context {context} exec {pod} -- sh -c "{command}"'.format(context=self.context, pod=pod, command=command_clean)
#fullcommand = 'kubectl exec '+self.activepod+' --container=dbms -- bash -c "'+command_clean+'"'
#print(fullcommand)
self.logger.debug('testbed.execute_command_in_pod({})'.format(fullcommand))
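With the switch from bash to sh, the assembled command also works in minimal images (such as Alpine-based ones) that do not ship bash. A hedged sketch of the string produced, with hypothetical context, pod and container names:

context, pod, container = 'mycluster', 'bexhoma-sut-0', 'dbms'   # hypothetical values
command_clean = 'nproc'.replace('"', '\\"')
fullcommand = 'kubectl --context {context} exec {pod} --container={container} -- sh -c "{command}"'.format(
    context=context, pod=pod, container=container, command=command_clean)
print(fullcommand)
# kubectl --context mycluster exec bexhoma-sut-0 --container=dbms -- sh -c "nproc"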
@@ -1335,22 +1336,28 @@ def test_if_monitoring_healthy(self):
config_K8s = self.config['credentials']['k8s']
if 'service_monitoring' in config_K8s['monitor']:
url = config_K8s['monitor']['service_monitoring'].format(namespace=self.contextdata['namespace'], service="monitoring")
query = "node_memory_MemTotal_bytes"
query = "sum(node_memory_MemTotal_bytes)"
safe_query = urllib.parse.quote_plus(query)
try:
self.logger.debug('Test URL {}'.format(url+"query_range?query="+safe_query+"&start=1&end=2&step=1"))
#code= urllib.request.urlopen(url+"query_range?query="+safe_query+"&start=1&end=2&step=1").getcode()
# curl -ILs www.welt.de | head -n 1|cut -d$' ' -f2
pod_dashboard = self.get_dashboard_pod_name()
self.logger.debug('Inside pod {}'.format(pod_dashboard))
now = datetime.utcnow()
start = now - timedelta(seconds=300) # 5 minutes ago
end = now - timedelta(seconds=240) # 4 minutes ago
cmd = {}
command = "curl -is '{}' | head -n 1|cut -d$' ' -f2".format(url+"query_range?query="+safe_query+"&start=1&end=2&step=1")
query_url = "{url}query_range?query={safe_query}&start={start}&end={end}&step=60".format(url=url, safe_query=safe_query, start=int(start.timestamp()), end=int(end.timestamp()))
self.logger.debug('Test URL {}'.format(query_url))
command = "curl -L --max-time 10 -is '{}' | head -n 1|cut -d$' ' -f2".format(query_url)
#command = "curl -is '{}' | head -n 1|cut -d$' ' -f2".format(url+"query_range?query="+safe_query+"&start=1&end=2&step=1")
self.logger.debug('Command {}'.format(command))
#fullcommand = 'kubectl exec '+self.pod_sut+' --container=dbms -- bash -c "'+command+'"'
#cores = os.popen(fullcommand).read()
stdin, stdout, stderr = self.execute_command_in_pod(pod=pod_dashboard, command=command, container="dashboard")
#print("Return", stdout, stderr)
status = stdout#os.popen(fullcommand).read()
self.logger.debug('Status {}'.format(status))
if len(status)>0:
#return int(status)
#print(int(status))
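A minimal sketch of the query_range URL assembled above (the Prometheus base URL is a placeholder; the real one comes from the cluster config):

import urllib.parse
from datetime import datetime, timedelta

url = 'http://bexhoma-service-monitoring-default:9090/api/v1/'   # placeholder base URL
safe_query = urllib.parse.quote_plus('sum(node_memory_MemTotal_bytes)')
now = datetime.utcnow()
start = now - timedelta(seconds=300)   # 5 minutes ago
end = now - timedelta(seconds=240)     # 4 minutes ago
query_url = "{url}query_range?query={safe_query}&start={start}&end={end}&step=60".format(
    url=url, safe_query=safe_query, start=int(start.timestamp()), end=int(end.timestamp()))
print(query_url)
# The pod then runs: curl -L --max-time 10 -is '<query_url>' | head -n 1 | cut -d$' ' -f2
# keeping only the HTTP status code (200 when the monitoring stack is healthy).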