From 0d150393844ae37a60c4913a04f5b3cf867b5ee0 Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Wed, 29 Mar 2023 11:45:37 +0200 Subject: [PATCH 01/14] [OMON-662] Store SOR timestamp in InfluxDB as soon as run starts (#317) --- CMakeLists.txt | 2 +- examples/12-KafkaToInfluxDb.cxx | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d45491f4..bc70096a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() # Define project project(Monitoring - VERSION 3.15.4 + VERSION 3.15.5 DESCRIPTION "O2 Monitoring library" LANGUAGES CXX ) diff --git a/examples/12-KafkaToInfluxDb.cxx b/examples/12-KafkaToInfluxDb.cxx index 1c5e97ee..20e9b0a3 100644 --- a/examples/12-KafkaToInfluxDb.cxx +++ b/examples/12-KafkaToInfluxDb.cxx @@ -48,12 +48,13 @@ int main(int argc, char* argv[]) if (stateChange.envinfo().state().empty()) { continue; } - std::cout << stateChange.envinfo().environmentid() << "/" << stateChange.envinfo().runnumber() << " " << change.first << " SOR: " < 1) { From 9a24cf73ccd0db792d8f293d96ebd6d448273d57 Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Tue, 4 Apr 2023 14:10:02 +0200 Subject: [PATCH 02/14] [OMON-662] Fix send with run method (#318) --- src/Backends/InfluxDB.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Backends/InfluxDB.cxx b/src/Backends/InfluxDB.cxx index 2d272615..70ed84c4 100644 --- a/src/Backends/InfluxDB.cxx +++ b/src/Backends/InfluxDB.cxx @@ -86,7 +86,7 @@ void InfluxDB::send(const Metric& metric) void InfluxDB::sendWithRun(const Metric& metric, const std::string& envId, const std::string& run) { auto serialized = toInfluxLineProtocol(metric); - serialized.insert(serialized.find(',') + 1, "run=" + run + ",envId=" + envId); + serialized.insert(serialized.find(' ') , ",run=" + run + ",envId=" + envId); mTransport->send(std::move(serialized)); } From c37d9370d83ae8297d804b8e73130ce408cf60a9 Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Wed, 12 Jul 2023 15:36:41 +0200 Subject: [PATCH 03/14] Fix compilation with GCC 12 (#321) --- src/ProcessMonitor.cxx | 1 + src/Transports/KafkaConsumer.cxx | 1 + src/Transports/KafkaProducer.cxx | 1 + 3 files changed, 3 insertions(+) diff --git a/src/ProcessMonitor.cxx b/src/ProcessMonitor.cxx index b7a4dbf6..c28cf6ab 100644 --- a/src/ProcessMonitor.cxx +++ b/src/ProcessMonitor.cxx @@ -18,6 +18,7 @@ #include "MonLogger.h" #include #include +#include #include #include #include diff --git a/src/Transports/KafkaConsumer.cxx b/src/Transports/KafkaConsumer.cxx index 6b28ec1c..e2f8b4ca 100644 --- a/src/Transports/KafkaConsumer.cxx +++ b/src/Transports/KafkaConsumer.cxx @@ -15,6 +15,7 @@ /// #include "KafkaConsumer.h" +#include #include #include "../MonLogger.h" #include "../Exceptions/MonitoringException.h" diff --git a/src/Transports/KafkaProducer.cxx b/src/Transports/KafkaProducer.cxx index 49b4233f..6d8cfd2d 100644 --- a/src/Transports/KafkaProducer.cxx +++ b/src/Transports/KafkaProducer.cxx @@ -15,6 +15,7 @@ /// #include "KafkaProducer.h" +#include #include #include "../MonLogger.h" From 35a3f833f88f48b32af74e4d150126b62af38805 Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Thu, 13 Jul 2023 14:39:10 +0200 Subject: [PATCH 04/14] [OMON-669] Detect orbitId mismatch (#320) --- CMakeLists.txt | 3 +- examples/14-OrbitId.cxx | 110 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 examples/14-OrbitId.cxx diff --git a/CMakeLists.txt b/CMakeLists.txt index bc70096a..fb2a9225 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() # Define project project(Monitoring - VERSION 3.15.5 + VERSION 3.16.0 DESCRIPTION "O2 Monitoring library" LANGUAGES CXX ) @@ -276,6 +276,7 @@ if(RdKafka_FOUND AND Protobuf_FOUND AND CURL_FOUND) set(PROTO_EXAMPLES examples/12-KafkaToInfluxDb.cxx + examples/14-OrbitId.cxx examples/8-KafkaToHttpServer.cxx ) foreach (example ${PROTO_EXAMPLES}) diff --git a/examples/14-OrbitId.cxx b/examples/14-OrbitId.cxx new file mode 100644 index 00000000..fc7e2883 --- /dev/null +++ b/examples/14-OrbitId.cxx @@ -0,0 +1,110 @@ +/// +/// \file 14-OrbitId.cxx +/// \author Adam Wegrzynek +/// + +#include "../src/Transports/KafkaConsumer.h" +#include "../src/Transports/Unix.h" + +#include +#include +#include +#include +#include +#include + +#include "envs.pb.h" + +using namespace o2::monitoring; + +std::map detectorRunMap; + +std::map referenceOrbitIdMap; + +std::string getValueFromMetric(const std::string& key, const std::string& metric) { + auto indexStart = metric.find(key + "="); + if (indexStart == std::string::npos) { + return {}; + } + auto indexEnd = std::find_if(metric.begin() + indexStart, metric.end(), [](const char& s) { return s == ' ' or s == ','; }); + return std::string(metric.begin() + indexStart + key.size() + 1, indexEnd); +} + + +int main(int argc, char* argv[]) +{ + boost::program_options::options_description desc("Program options"); + desc.add_options() + ("kafka-host", boost::program_options::value()->required(), "Kafka broker hostname"); + boost::program_options::variables_map vm; + boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); + boost::program_options::notify(vm); + + std::vector topics = {"aliecs.env_list.RUNNING", "cru.link_status"}; + auto kafkaConsumer = std::make_unique(vm["kafka-host"].as() + ":9092", topics, "orbitid"); + auto unixSocket = std::make_unique("/tmp/telegraf.sock"); + for (;;) { + auto messages = kafkaConsumer->pull(); + if (!messages.empty()) { + for (auto& message : messages) { + // handle active runs messages + if (message.first == "aliecs.env_list.RUNNING") { + aliceo2::envs::ActiveRunsList activeRuns; + activeRuns.ParseFromString(message.second); + detectorRunMap.clear(); + for (int i = 0; i < activeRuns.activeruns_size(); i++) { + auto run = activeRuns.activeruns(i).runnumber(); + for (int j = 0; j < activeRuns.activeruns(i).detectors_size(); j++) { + auto detector = activeRuns.activeruns(i).detectors(j); + for (auto& c : detector) c = std::tolower(c); + detectorRunMap.insert({detector, run}); + } + } + if (detectorRunMap.empty()) { + std::cout << "No ongoing runs" << std::endl; + referenceOrbitIdMap.clear(); + } + for (const auto &p : detectorRunMap) { + std::cout << p.first << " belongs to run " << p.second << std::endl; + } + // if SOR + // handle link status messages + } else if (message.first == "cru.link_status") { + auto detector = getValueFromMetric("detector", message.second); + auto orbitId = getValueFromMetric("orbitSor", message.second); + auto status = getValueFromMetric("status", message.second); + if (detector.empty() or orbitId.empty()) { + continue; + } + // if detector is not running + auto detectorInRun = detectorRunMap.find(detector); + if (detectorInRun == detectorRunMap.end()) { + continue; + } + // if link is excluded + if (status != "1i") { + continue; + } + + std::string outputMetric = "orbitIdMismatch" + message.second.substr(message.second.find(","), message.second.find(" ") - message.second.find(",")) + ",run=" + std::to_string(detectorRunMap.at(detector)); + auto referenceOrbit = referenceOrbitIdMap.find(detectorRunMap.at(detector)); + if (referenceOrbit == referenceOrbitIdMap.end()) { + /// wait for trigger + if (orbitId == "0i") { + continue; + } + referenceOrbitIdMap.insert({detectorRunMap.at(detector), orbitId}); + std::cout << "Set reference orbitId for run " << detectorRunMap.at(detector) << ": " << orbitId << std::endl; + unixSocket->send(outputMetric + " reference=" + orbitId); + } + auto referenceOrbitId = referenceOrbitIdMap.at(detectorRunMap.at(detector)); + if (orbitId != referenceOrbitId) { + std::cout << "Abnormal condition for " << detector << "; expected orbitID: " << referenceOrbitId << " but got: " << orbitId << std::endl; + unixSocket->send(outputMetric + " mismatched=" + orbitId); + } + } + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} From a851c52ba52bdb0963e73675554a6ecafa1605d3 Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Thu, 13 Jul 2023 15:02:29 +0200 Subject: [PATCH 05/14] [OMON-669] Make sure to install OrbitId example (#322) --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb2a9225..9bc9c33e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() # Define project project(Monitoring - VERSION 3.16.0 + VERSION 3.16.1 DESCRIPTION "O2 Monitoring library" LANGUAGES CXX ) @@ -293,6 +293,7 @@ if(RdKafka_FOUND AND Protobuf_FOUND AND CURL_FOUND) endforeach() set_target_properties(8-KafkaToHttpServer PROPERTIES OUTPUT_NAME "o2-monitoring-env-webserver") set_target_properties(12-KafkaToInfluxDb PROPERTIES OUTPUT_NAME "o2-monitoring-kafka-to-influxdb") + set_target_properties(14-OrbitId PROPERTIES OUTPUT_NAME "o2-monitoring-orbitid") endif() #################################### @@ -345,6 +346,11 @@ if(RdKafka_FOUND) install(TARGETS 11-KafkaToWebsocket) endif() + +if(RdKafka_FOUND AND Protobuf_FOUND) + install(TARGETS 14-OrbitId) +endif() + # Create version file include(CMakePackageConfigHelpers) write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/cmake/MonitoringConfigVersion.cmake" From 98d8b060e7f01d1652e5bbb19c01fa58c2a8d55c Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Mon, 17 Jul 2023 15:49:50 +0200 Subject: [PATCH 06/14] [OMON-669] Drop all orbit IDs that are 0 (#323) --- CMakeLists.txt | 2 +- examples/14-OrbitId.cxx | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9bc9c33e..93bd375e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() # Define project project(Monitoring - VERSION 3.16.1 + VERSION 3.16.2 DESCRIPTION "O2 Monitoring library" LANGUAGES CXX ) diff --git a/examples/14-OrbitId.cxx b/examples/14-OrbitId.cxx index fc7e2883..d5df6939 100644 --- a/examples/14-OrbitId.cxx +++ b/examples/14-OrbitId.cxx @@ -85,6 +85,11 @@ int main(int argc, char* argv[]) if (status != "1i") { continue; } + // Temporary disable 0s + // TODO: remove + if (orbitId == "0i") { + continue; + } std::string outputMetric = "orbitIdMismatch" + message.second.substr(message.second.find(","), message.second.find(" ") - message.second.find(",")) + ",run=" + std::to_string(detectorRunMap.at(detector)); auto referenceOrbit = referenceOrbitIdMap.find(detectorRunMap.at(detector)); From bb82f39a0364b0333ba5e79daebcfd5831985a6b Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Thu, 20 Jul 2023 10:47:25 +0200 Subject: [PATCH 07/14] [OMON-669] Temporary remove MCH and MID from orbit ID detection (#324) --- examples/14-OrbitId.cxx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/14-OrbitId.cxx b/examples/14-OrbitId.cxx index d5df6939..9bd6a9e6 100644 --- a/examples/14-OrbitId.cxx +++ b/examples/14-OrbitId.cxx @@ -76,6 +76,12 @@ int main(int argc, char* argv[]) if (detector.empty() or orbitId.empty()) { continue; } + // Temporary disregard MCH and MID + // TODO: remove + if (detector == "mch" || detector == "mid") { + continue; + } + // if detector is not running auto detectorInRun = detectorRunMap.find(detector); if (detectorInRun == detectorRunMap.end()) { From 37b3e9af4d8e421b4a5946395c2065c0cd221124 Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Thu, 20 Jul 2023 12:01:18 +0200 Subject: [PATCH 08/14] [OMON-691] Use MonLogger in executables (#325) --- CMakeLists.txt | 4 +++- examples/12-KafkaToInfluxDb.cxx | 5 +++-- examples/14-OrbitId.cxx | 9 +++++---- examples/8-KafkaToHttpServer.cxx | 11 ++++++----- src/MonLogger.h | 5 +++-- 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 93bd375e..ea666147 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() # Define project project(Monitoring - VERSION 3.16.2 + VERSION 3.17.0 DESCRIPTION "O2 Monitoring library" LANGUAGES CXX ) @@ -287,7 +287,9 @@ if(RdKafka_FOUND AND Protobuf_FOUND AND CURL_FOUND) Monitoring Boost::program_options protobuf::libprotobuf + $<$:AliceO2::InfoLogger> ) + target_compile_definitions(${example_name} PRIVATE $<$:O2_MONITORING_WITH_INFOLOGGER>) target_include_directories(${example_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) install(TARGETS ${example_name}) endforeach() diff --git a/examples/12-KafkaToInfluxDb.cxx b/examples/12-KafkaToInfluxDb.cxx index 20e9b0a3..12116a89 100644 --- a/examples/12-KafkaToInfluxDb.cxx +++ b/examples/12-KafkaToInfluxDb.cxx @@ -14,6 +14,7 @@ #include #include "envs.pb.h" +#include "../src/MonLogger.h" using namespace o2::monitoring; @@ -51,10 +52,10 @@ int main(int argc, char* argv[]) auto metric = Metric{"run_times"}; if (change.first.find("leave") != std::string::npos) { metric.addValue(stateChange.timestamp(), "eor"); - std::cout << stateChange.envinfo().environmentid() << "/" << stateChange.envinfo().runnumber() << " " << change.first << " EOR: " << stateChange.timestamp() << std::endl; + MonLogger::Get() << stateChange.envinfo().environmentid() << "/" << stateChange.envinfo().runnumber() << " " << change.first << " EOR: " << stateChange.timestamp() << MonLogger::End(); } else { metric.addValue(stateChange.envinfo().runtype(), "type").addValue(stateChange.envinfo().enterstatetimestamp(), "sor"); - std::cout << stateChange.envinfo().environmentid() << "/" << stateChange.envinfo().runnumber() << " " << change.first << " SOR: " < 1) { diff --git a/examples/14-OrbitId.cxx b/examples/14-OrbitId.cxx index 9bd6a9e6..ff8627aa 100644 --- a/examples/14-OrbitId.cxx +++ b/examples/14-OrbitId.cxx @@ -14,6 +14,7 @@ #include #include "envs.pb.h" +#include "../src/MonLogger.h" using namespace o2::monitoring; @@ -61,11 +62,11 @@ int main(int argc, char* argv[]) } } if (detectorRunMap.empty()) { - std::cout << "No ongoing runs" << std::endl; + MonLogger::Get() << "No ongoing runs" << MonLogger::End(); referenceOrbitIdMap.clear(); } for (const auto &p : detectorRunMap) { - std::cout << p.first << " belongs to run " << p.second << std::endl; + MonLogger::Get() << p.first << " belongs to run " << p.second << MonLogger::End(); } // if SOR // handle link status messages @@ -105,12 +106,12 @@ int main(int argc, char* argv[]) continue; } referenceOrbitIdMap.insert({detectorRunMap.at(detector), orbitId}); - std::cout << "Set reference orbitId for run " << detectorRunMap.at(detector) << ": " << orbitId << std::endl; + MonLogger::Get() << "Set reference orbitId for run " << detectorRunMap.at(detector) << ": " << orbitId << MonLogger::End(); unixSocket->send(outputMetric + " reference=" + orbitId); } auto referenceOrbitId = referenceOrbitIdMap.at(detectorRunMap.at(detector)); if (orbitId != referenceOrbitId) { - std::cout << "Abnormal condition for " << detector << "; expected orbitID: " << referenceOrbitId << " but got: " << orbitId << std::endl; + MonLogger::Get() << "Abnormal condition for " << detector << "; expected orbitID: " << referenceOrbitId << " but got: " << orbitId << MonLogger::End(); unixSocket->send(outputMetric + " mismatched=" + orbitId); } } diff --git a/examples/8-KafkaToHttpServer.cxx b/examples/8-KafkaToHttpServer.cxx index 03c56141..30487df2 100644 --- a/examples/8-KafkaToHttpServer.cxx +++ b/examples/8-KafkaToHttpServer.cxx @@ -16,12 +16,13 @@ #include "../src/Transports/KafkaConsumer.h" #include "envs.pb.h" +#include "../src/MonLogger.h" namespace beast = boost::beast; namespace http = beast::http; using tcp = boost::asio::ip::tcp; using namespace std::literals::string_literals; - +using o2::monitoring::MonLogger; aliceo2::envs::ActiveRunsList gActiveEnvs; std::mutex gEnvAccess; @@ -198,11 +199,11 @@ void deserializeActiveRuns(const std::string& lastActiveRunMessage) aliceo2::envs::ActiveRunsList activeRuns; activeRuns.ParseFromString(lastActiveRunMessage); if (activeRuns.activeruns_size() == 0) { - std::cout << "Empty active runs" << std::endl; + MonLogger::Get() << "Empty active runs" << MonLogger::End(); } else { for (int i = 0; i < activeRuns.activeruns_size(); i++) { - std::cout << "Active run: " << activeRuns.activeruns(i).runnumber() << " (" - << activeRuns.activeruns(i).environmentid() << ")" << std::endl; + MonLogger::Get() << "Active run: " << activeRuns.activeruns(i).runnumber() << " (" + << activeRuns.activeruns(i).environmentid() << ")" << MonLogger::End(); } } const std::lock_guard lock(gEnvAccess); @@ -220,7 +221,7 @@ int main(int argc, char* argv[]) { boost::program_options::notify(vm); unsigned short port = vm["http-port"].as(); - std::cout << "Using Kafka instance: " << vm["kafka-host"].as() << ":9092 and HTTP server port: " << port << std::endl; + MonLogger::Get() << "Using Kafka instance: " << vm["kafka-host"].as() << ":9092 and HTTP server port: " << port << MonLogger::End(); std::thread webServerThread([&port](){ auto const address = boost::asio::ip::make_address("0.0.0.0"); boost::asio::io_context ioc{1}; diff --git a/src/MonLogger.h b/src/MonLogger.h index 74296c15..4fadcb61 100644 --- a/src/MonLogger.h +++ b/src/MonLogger.h @@ -18,12 +18,13 @@ #define MONITORING_MONINFOLOGGER_H #include -#include -#include #ifdef O2_MONITORING_WITH_INFOLOGGER #include using namespace AliceO2::InfoLogger; +#else +#include +#include #endif namespace o2 From 60450759eb57e76ec6900556106aa9cb60329d8e Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Thu, 20 Jul 2023 15:59:21 +0200 Subject: [PATCH 09/14] [OMON-691] Change default log severity in examples (#326) --- CMakeLists.txt | 2 +- examples/12-KafkaToInfluxDb.cxx | 1 + examples/14-OrbitId.cxx | 1 + examples/8-KafkaToHttpServer.cxx | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea666147..7ad43661 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() # Define project project(Monitoring - VERSION 3.17.0 + VERSION 3.17.1 DESCRIPTION "O2 Monitoring library" LANGUAGES CXX ) diff --git a/examples/12-KafkaToInfluxDb.cxx b/examples/12-KafkaToInfluxDb.cxx index 12116a89..7b5c4dbf 100644 --- a/examples/12-KafkaToInfluxDb.cxx +++ b/examples/12-KafkaToInfluxDb.cxx @@ -31,6 +31,7 @@ int main(int argc, char* argv[]) boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); boost::program_options::notify(vm); + MonLogger::mLoggerSeverity = Severity::Debug; std::vector topics = {"aliecs.env_leave_state.RUNNING", "aliecs.env_state.RUNNING"}; auto kafkaConsumer = std::make_unique(vm["kafka-host"].as() + ":9092", topics, "aliecs-run-times"); auto httpTransport = std::make_unique( diff --git a/examples/14-OrbitId.cxx b/examples/14-OrbitId.cxx index ff8627aa..656e9020 100644 --- a/examples/14-OrbitId.cxx +++ b/examples/14-OrbitId.cxx @@ -41,6 +41,7 @@ int main(int argc, char* argv[]) boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); boost::program_options::notify(vm); + MonLogger::mLoggerSeverity = Severity::Debug; std::vector topics = {"aliecs.env_list.RUNNING", "cru.link_status"}; auto kafkaConsumer = std::make_unique(vm["kafka-host"].as() + ":9092", topics, "orbitid"); auto unixSocket = std::make_unique("/tmp/telegraf.sock"); diff --git a/examples/8-KafkaToHttpServer.cxx b/examples/8-KafkaToHttpServer.cxx index 30487df2..b1953383 100644 --- a/examples/8-KafkaToHttpServer.cxx +++ b/examples/8-KafkaToHttpServer.cxx @@ -221,6 +221,7 @@ int main(int argc, char* argv[]) { boost::program_options::notify(vm); unsigned short port = vm["http-port"].as(); + MonLogger::mLoggerSeverity = o2::monitoring::Severity::Debug; MonLogger::Get() << "Using Kafka instance: " << vm["kafka-host"].as() << ":9092 and HTTP server port: " << port << MonLogger::End(); std::thread webServerThread([&port](){ auto const address = boost::asio::ip::make_address("0.0.0.0"); From 04287ea2c2e68d9df9098b764fa2547ec7b0032e Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Fri, 21 Jul 2023 11:01:13 +0200 Subject: [PATCH 10/14] [OMON-669] Disregard calibration run when checking orbit ID (#327) --- CMakeLists.txt | 2 +- examples/14-OrbitId.cxx | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ad43661..2e134779 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() # Define project project(Monitoring - VERSION 3.17.1 + VERSION 3.17.2 DESCRIPTION "O2 Monitoring library" LANGUAGES CXX ) diff --git a/examples/14-OrbitId.cxx b/examples/14-OrbitId.cxx index 656e9020..45f60c0d 100644 --- a/examples/14-OrbitId.cxx +++ b/examples/14-OrbitId.cxx @@ -58,6 +58,11 @@ int main(int argc, char* argv[]) auto run = activeRuns.activeruns(i).runnumber(); for (int j = 0; j < activeRuns.activeruns(i).detectors_size(); j++) { auto detector = activeRuns.activeruns(i).detectors(j); + auto runType = activeRuns.activeruns(i).runtype(); + if (runType.find("calib") != std::string::npos || runType.find("CALIB") != std::string::npos) { + MonLogger::Get() << "Skipping calibration run " << run << MonLogger::End(); + continue; + } for (auto& c : detector) c = std::tolower(c); detectorRunMap.insert({detector, run}); } From ebfa94f216209d435f2bb27b0e4c18c61f3aef99 Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Fri, 21 Jul 2023 14:33:09 +0200 Subject: [PATCH 11/14] [OMON-669] Drop wrongly reported values during BAR read (#328) --- CMakeLists.txt | 2 +- examples/14-OrbitId.cxx | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e134779..2976a09a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() # Define project project(Monitoring - VERSION 3.17.2 + VERSION 3.17.3 DESCRIPTION "O2 Monitoring library" LANGUAGES CXX ) diff --git a/examples/14-OrbitId.cxx b/examples/14-OrbitId.cxx index 45f60c0d..b28eb2dc 100644 --- a/examples/14-OrbitId.cxx +++ b/examples/14-OrbitId.cxx @@ -104,6 +104,11 @@ int main(int argc, char* argv[]) continue; } + // Drop wrongly reported values during BAR read (0xFFFFFFFF) + if (orbitId == "4294967295i") { + continue; + } + std::string outputMetric = "orbitIdMismatch" + message.second.substr(message.second.find(","), message.second.find(" ") - message.second.find(",")) + ",run=" + std::to_string(detectorRunMap.at(detector)); auto referenceOrbit = referenceOrbitIdMap.find(detectorRunMap.at(detector)); if (referenceOrbit == referenceOrbitIdMap.end()) { From 9f7aa552c506fdafa08b07c9e2ddcb81661c5171 Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Fri, 1 Sep 2023 14:50:49 +0200 Subject: [PATCH 12/14] [OMON-699] Attempt to scrape metrics from ODC endpoint (#329) --- CMakeLists.txt | 49 ++++++ examples/15-ODC.cxx | 249 ++++++++++++++++++++++++++++++ examples/8-KafkaToHttpServer.cxx | 87 +---------- examples/helpers/HttpConnection.h | 90 +++++++++++ proto/odc.proto | 226 +++++++++++++++++++++++++++ 5 files changed, 617 insertions(+), 84 deletions(-) create mode 100644 examples/15-ODC.cxx create mode 100644 examples/helpers/HttpConnection.h create mode 100644 proto/odc.proto diff --git a/CMakeLists.txt b/CMakeLists.txt index 2976a09a..471e08b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,7 @@ find_package(CURL MODULE) find_package(RdKafka CONFIG) find_package(InfoLogger CONFIG) find_package(Protobuf) +find_package(gRPC CONFIG) #################################### # Set OUTPUT vars @@ -298,6 +299,50 @@ if(RdKafka_FOUND AND Protobuf_FOUND AND CURL_FOUND) set_target_properties(14-OrbitId PROPERTIES OUTPUT_NAME "o2-monitoring-orbitid") endif() +#################################### +# gRPC +#################################### +if(Protobuf_FOUND AND gRPC_FOUND) + set(PROTO_FILE ${CMAKE_CURRENT_SOURCE_DIR}/proto/odc.proto) + get_filename_component(PROTO_OUTPUT_NAME ${PROTO_FILE} NAME_WE) + get_filename_component(PROTO_FILE_PREFIX ${PROTO_FILE} PATH) + set(PROTO_CPP_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${PROTO_OUTPUT_NAME}.pb.cc) + set(GRPC_CPP_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${PROTO_OUTPUT_NAME}.grpc.pb.cc) + + add_custom_command( + OUTPUT "${PROTO_CPP_OUTPUT}" + COMMAND protobuf::protoc + ARGS --proto_path ${PROTO_FILE_PREFIX} + --cpp_out ${CMAKE_CURRENT_BINARY_DIR} + ${PROTO_OUTPUT_NAME}.proto + DEPENDS ${PROTO_FILE} + COMMENT "Running protoc on ${PROTO_FILE}" + VERBATIM) + + add_custom_command( + OUTPUT "${GRPC_CPP_OUTPUT}" + COMMAND protobuf::protoc + ARGS --proto_path ${PROTO_FILE_PREFIX} + --grpc_out=${CMAKE_CURRENT_BINARY_DIR} + --plugin=protoc-gen-grpc=$ + ${PROTO_OUTPUT_NAME}.proto + DEPENDS ${PROTO_FILE} + COMMENT "Running protoc/gRPC on ${PROTO_FILE}" + VERBATIM) + + set(example examples/15-ODC.cxx) + get_filename_component(example_name ${example} NAME) + string(REGEX REPLACE ".cxx" "" example_name ${example_name}) + add_executable(${example_name} ${example} ${PROTO_CPP_OUTPUT} ${GRPC_CPP_OUTPUT}) + target_link_libraries(${example_name} PRIVATE + gRPC::grpc++ + protobuf::libprotobuf + Boost::program_options + ) + target_include_directories(${example_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + set_target_properties(15-ODC PROPERTIES OUTPUT_NAME "o2-monitoring-odc") +endif() + #################################### # Tests #################################### @@ -353,6 +398,10 @@ if(RdKafka_FOUND AND Protobuf_FOUND) install(TARGETS 14-OrbitId) endif() +if(Protobuf_FOUND AND gRPC_FOUND) + install(TARGETS 15-ODC) +endif() + # Create version file include(CMakePackageConfigHelpers) write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/cmake/MonitoringConfigVersion.cmake" diff --git a/examples/15-ODC.cxx b/examples/15-ODC.cxx new file mode 100644 index 00000000..f6c0cca0 --- /dev/null +++ b/examples/15-ODC.cxx @@ -0,0 +1,249 @@ +/// +/// \file 15-ODC.cxx +/// \author Adam Wegrzynek +/// + +#include +#include +#include "odc.grpc.pb.h" +#include "helpers/HttpConnection.h" +#include +#include +#include +#include "../src/MonLogger.h" + +using grpc::Channel; +using grpc::ClientContext; +using grpc::Status; + +using odc::ODC; +using odc::StatusRequest; +using odc::StatusReply; + +using odc::StateRequest; +using odc::StateReply; +using odc::GeneralReply; +using o2::monitoring::MonLogger; + +std::mutex gMapAccess; + +struct OdcStats { + int EpnCount; + int FailedTasks; + unsigned int RecoTasks; + unsigned int CalibTasks; + std::unordered_map TasksPerCalib; + std::unordered_map FailedTasksPerCalib; + std::unordered_map CalibNames; + std::string State; +}; + +std::map gStats; + +void httpServer(tcp::acceptor& acceptor, tcp::socket& socket) { + acceptor.async_accept(socket, [&](beast::error_code ec) { + if (!ec) { + auto connection = std::make_shared(std::move(socket)); + connection->addCallback("SHOW+RETENTION+POLICIES", + [](http::request& /*request*/, http::response& response) { + response.set(http::field::content_type, "application/json"); + beast::ostream(response.body()) << "{}\n"; + }); + connection->addCallback("SHOW+measurements", + [](http::request& /*request*/, http::response& response) { + response.set(http::field::content_type, "application/json"); + beast::ostream(response.body()) << R"({"results":[{"statement_id":0,"series":[{"name":"measurements","columns":["name"],"values":[["odc"]]}]}]}\n)"; + }); + connection->addCallback("SHOW+TAG+VALUES+FROM+calibs+WHERE+partitionid", + [](http::request& request, http::response& response) { + std::string jsonPrefix = R"({"results": [{"statement_id": 0, "series": [{"name": "odc_calibs", "columns": ["key", "value"], "values": [)"; + std::string jsonSuffix = R"(]}]}]})"; + response.set(http::field::content_type, "application/json"); + std::string calibJson; + std::string id = std::string(request.target().substr(request.target().find("WHERE+partitionid+%3D+") + 22)); + const std::lock_guard lock(gMapAccess); + if (gStats.find(id) != gStats.end()) { + for (auto const& calib : gStats.at(id).TasksPerCalib) { + calibJson += "[\"calib\", \"" + calib.first + "\"],"; + } + } + if (!calibJson.empty()) { + calibJson.pop_back(); + } + beast::ostream(response.body()) << jsonPrefix << calibJson << jsonSuffix << '\n'; + }); + connection->addCallback("odc_status+WHERE+partitionid", + [](http::request& request, http::response& response) { + std::string jsonPrefix = R"({"results":[{"statement_id":0,"series":[{"name":"odc","columns":["time","State","EPN count","Failed tasks", "Calib tasks", "Reco tasks"],"values":[)"; + std::string jsonSuffix = R"(]}]}]})"; + response.set(http::field::content_type, "application/json"); + std::string id = std::string(request.target().substr(request.target().find("WHERE+partitionid+%3D+") + 22)); + std::string odcStatJson; + const std::lock_guard lock(gMapAccess); + if (gStats.find(id) != gStats.end()) { + odcStatJson += "[" + std::to_string(0) + ", \"" + + gStats.at(id).State + "\", \"" + + std::to_string(gStats.at(id).EpnCount) + "\", \"" + + std::to_string(gStats.at(id).FailedTasks) + "\", \"" + + std::to_string(gStats.at(id).CalibTasks) + "\", \"" + + std::to_string(gStats.at(id).RecoTasks) + "\"]"; + } + beast::ostream(response.body()) << jsonPrefix << odcStatJson << jsonSuffix << '\n'; + }); + connection->addCallback("calib_tasks+WHERE+calib", + [](http::request& request, http::response& response) { + std::string jsonPrefix = R"({"results":[{"statement_id":0,"series":[{"name":"calib_tasks","columns":["time","Name", "Total","Failed"],"values":[)"; + std::string jsonSuffix = R"(]}]}]})"; + response.set(http::field::content_type, "application/json"); + std::string calib = std::string(request.target().substr(request.target().find("WHERE+calib+%3D+") + 16)); + std::string calibTasksJson; + const std::lock_guard lock(gMapAccess); + calibTasksJson += "[" + std::to_string(0); + for (const auto& run : gStats) { + if (run.second.TasksPerCalib.find(calib) != run.second.TasksPerCalib.end()) { + calibTasksJson += ", \""; + if (run.second.CalibNames.find(calib) != run.second.CalibNames.end()) { + calibTasksJson += run.second.CalibNames.at(calib) + "\","; + } else { + calibTasksJson += "\","; + } + calibTasksJson += std::to_string(run.second.TasksPerCalib.at(calib)); + if (run.second.FailedTasksPerCalib.find(calib) != run.second.FailedTasksPerCalib.end()) { + calibTasksJson += "," + std::to_string(run.second.FailedTasksPerCalib.at(calib)); + } else { + calibTasksJson += ",0"; + } + } + } + calibTasksJson += "]"; + beast::ostream(response.body()) << jsonPrefix << calibTasksJson << jsonSuffix << '\n'; + }); + connection->start(); + } + httpServer(acceptor, socket); + }); +} + +class OdcClient { + public: + OdcClient(std::shared_ptr channel) : mStub(ODC::NewStub(channel)) {} + void getStatus() { + gStats.clear(); + StatusRequest request; + request.set_running(true); + StatusReply reply; + ClientContext context; + Status status = mStub->Status(&context, request, &reply); + if (status.ok()) { + MonLogger::Get() << "Status call OK" << MonLogger::End(); + for (int i = 0; i < reply.partitions_size(); i++) { + auto partitionId = reply.partitions(i).partitionid(); + OdcStats stats; + stats.State = reply.partitions(i).state(); + getRunState(partitionId, stats); + const std::lock_guard lock(gMapAccess); + gStats.insert({partitionId, stats}); + } + } else { + std::cout << status.error_code() << ": " << status.error_message() << std::endl; + } + } + void getRunState(const std::string& partitionId, OdcStats& stats) { + StateRequest request; + request.set_detailed(true); + request.set_partitionid(partitionId); + StateReply reply; + ClientContext context; + Status status = mStub->GetState(&context, request, &reply); + if (status.ok()) { + MonLogger::Get() << "State call for " << partitionId << " OK" << MonLogger::End(); + unsigned int failedCount = 0; + std::unordered_set uniqueEpns{}; + std::unordered_set calibCollections{}; + unsigned int recoTasks = 0; + unsigned int calibTasks = 0; + std::regex rReco("_reco[0-9]+_"); + std::regex rCalib("_calib[0-9]+_"); + for (int i = 0; i < reply.devices_size(); i++) { + if (reply.devices(i).state() == "ERROR") { + failedCount++; + } + uniqueEpns.insert(reply.devices(i).host()); + if (std::regex_search(reply.devices(i).path(), rReco)) { + recoTasks++; + } + if (std::regex_search(reply.devices(i).path(), rCalib)) { + calibTasks++; + const auto& path = reply.devices(i).path(); + auto calibIdx = path.find("_calib"); + auto calib = path.substr(calibIdx + 1, path.size()-calibIdx-3); + auto calibName = path.substr(path.find_last_of('/') + 1, calibIdx - path.find_last_of('/') - 1); + if (calibName.find("aggregator-proxy-") != std::string::npos) { + stats.CalibNames.insert({calib, calibName.substr(calibName.find_last_of('-') + 1)}); + } + auto it = stats.TasksPerCalib.find(calib); + if (it != stats.TasksPerCalib.end()) { + it->second++; + } + else { + stats.TasksPerCalib.insert({calib, 1}); + } + if (reply.devices(i).state() == "ERROR") { + auto it = stats.FailedTasksPerCalib.find(calib); + if (it != stats.FailedTasksPerCalib.end()) { + it->second++; + } + else { + stats.FailedTasksPerCalib.insert({calib, 1}); + } + } + } + } + const std::lock_guard lock(gMapAccess); + stats.RecoTasks = recoTasks; + stats.CalibTasks = calibTasks; + stats.EpnCount = uniqueEpns.size(); + stats.FailedTasks = failedCount; + } else { + std::cout << status.error_code() << ": " << status.error_message() << std::endl; + } + } + + private: + std::unique_ptr mStub; +}; + + +int main(int argc, char* argv[]) { + boost::program_options::options_description desc("Program options"); + desc.add_options() + ("odc-host", boost::program_options::value()->required(), "ODC hostname") + ("odc-port", boost::program_options::value()->required(), "ODC port") + ("http-port", boost::program_options::value()->default_value(8088), "HTTP server bind port"); + boost::program_options::variables_map vm; + boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); + boost::program_options::notify(vm); + unsigned short port = vm["http-port"].as(); + + MonLogger::mLoggerSeverity = o2::monitoring::Severity::Debug; + MonLogger::Get() << "Connected to ODC server: " << vm["odc-host"].as() << ":" << vm["odc-port"].as() << "; serving HTTP on port: " << port << MonLogger::End(); + std::thread webServerThread([&port](){ + auto const address = boost::asio::ip::make_address("0.0.0.0"); + boost::asio::io_context ioc{1}; + tcp::acceptor acceptor{ioc, {address, port}}; + tcp::socket socket{ioc}; + httpServer(acceptor, socket); + ioc.run(); + }); + grpc::ChannelArguments args; + args.SetMaxReceiveMessageSize(20*1024*1024); + OdcClient client(grpc::CreateCustomChannel( + vm["odc-host"].as() + ":" + std::to_string(vm["odc-port"].as()), + grpc::InsecureChannelCredentials(), + args + )); + for (;;) { + client.getStatus(); + std::this_thread::sleep_for(std::chrono::seconds(15)); + } +} diff --git a/examples/8-KafkaToHttpServer.cxx b/examples/8-KafkaToHttpServer.cxx index b1953383..e5b30c8f 100644 --- a/examples/8-KafkaToHttpServer.cxx +++ b/examples/8-KafkaToHttpServer.cxx @@ -1,9 +1,5 @@ #define BOOST_BEAST_USE_STD_STRING_VIEW -#include -#include -#include -#include #include #include #include @@ -14,96 +10,17 @@ #include #include +#include "helpers/HttpConnection.h" #include "../src/Transports/KafkaConsumer.h" #include "envs.pb.h" #include "../src/MonLogger.h" -namespace beast = boost::beast; -namespace http = beast::http; -using tcp = boost::asio::ip::tcp; using namespace std::literals::string_literals; using o2::monitoring::MonLogger; aliceo2::envs::ActiveRunsList gActiveEnvs; std::mutex gEnvAccess; - -class httpConnection : public std::enable_shared_from_this -{ -public: - httpConnection(tcp::socket socket) : mSocket(std::move(socket)) {} - - // Initiate the asynchronous operations associated with the connection. - void start() { - readRequest(); - checkDeadline(); - } - - void addCallback(std::string_view path, std::function& request, http::response& response)> callback) { - mCallbacks.insert({path, callback}); - } - -private: - tcp::socket mSocket; - beast::flat_buffer mBuffer{8192}; - http::request mRequest; - http::response mResponse; - boost::asio::steady_timer mDeadline{mSocket.get_executor(), std::chrono::seconds(5)}; - std::map& request, http::response& response)>> mCallbacks; - - // Asynchronously receive a complete request message. - void readRequest() { - auto self = shared_from_this(); - http::async_read(mSocket, mBuffer, mRequest, [self](beast::error_code ec, std::size_t bytes_transferred) { - boost::ignore_unused(bytes_transferred); - if (!ec) self->processRequest(); - }); - } - - // Determine what needs to be done with the request message. - void processRequest() { - mResponse.version(mRequest.version()); - mResponse.keep_alive(false); - mResponse.result(http::status::ok); - createResponse(); - writeResponse(); - } - - // Construct a response message based on the program state. - void createResponse() { - for (const auto& [key, value] : mCallbacks) { - if (mRequest.target().find(key) != std::string_view::npos) { - value(mRequest, mResponse); - return; - } - } - mResponse.result(http::status::not_found); - mResponse.set(http::field::content_type, "text/plain"); - beast::ostream(mResponse.body()) << "Not found\r\n"; - } - - // Asynchronously transmit the response message. - void writeResponse() { - auto self = shared_from_this(); - mResponse.content_length(mResponse.body().size()); - http::async_write(mSocket, mResponse, [self](beast::error_code ec, std::size_t) { - self->mSocket.shutdown(tcp::socket::shutdown_send, ec); - self->mDeadline.cancel(); - }); - } - - // Check whether we have spent enough time on this connection. - void checkDeadline() { - auto self = shared_from_this(); - mDeadline.async_wait( - [self](beast::error_code ec) { - if(!ec) { - self->mSocket.close(ec); - } - }); - } -}; - // "Loop" forever accepting new connections. void httpServer(tcp::acceptor& acceptor, tcp::socket& socket) { acceptor.async_accept(socket, [&](beast::error_code ec) { @@ -126,6 +43,8 @@ void httpServer(tcp::acceptor& acceptor, tcp::socket& socket) { } if (!envsJson.empty()) { envsJson.pop_back(); + } else { + envsJson += "[\"run\", \"0\"]"; } beast::ostream(response.body()) << jsonPrefix << envsJson << jsonSuffix << '\n'; }); diff --git a/examples/helpers/HttpConnection.h b/examples/helpers/HttpConnection.h new file mode 100644 index 00000000..32e520bf --- /dev/null +++ b/examples/helpers/HttpConnection.h @@ -0,0 +1,90 @@ +#define BOOST_BEAST_USE_STD_STRING_VIEW + +#include +#include +#include +#include +#include +#include + +namespace beast = boost::beast; +namespace http = beast::http; +using tcp = boost::asio::ip::tcp; + + +/// Helper class to represent HTTP connection +class httpConnection : public std::enable_shared_from_this +{ +public: + httpConnection(tcp::socket socket) : mSocket(std::move(socket)) {} + + // Initiate the asynchronous operations associated with the connection. + void start() { + readRequest(); + checkDeadline(); + } + + void addCallback(std::string_view path, std::function& request, http::response& response)> callback) { + mCallbacks.insert({path, callback}); + } + +private: + tcp::socket mSocket; + beast::flat_buffer mBuffer{8192}; + http::request mRequest; + http::response mResponse; + boost::asio::steady_timer mDeadline{mSocket.get_executor(), std::chrono::seconds(5)}; + std::map& request, http::response& response)>> mCallbacks; + + // Asynchronously receive a complete request message. + void readRequest() { + auto self = shared_from_this(); + http::async_read(mSocket, mBuffer, mRequest, [self](beast::error_code ec, std::size_t bytes_transferred) { + boost::ignore_unused(bytes_transferred); + if (!ec) self->processRequest(); + }); + } + + // Determine what needs to be done with the request message. + void processRequest() { + mResponse.version(mRequest.version()); + mResponse.keep_alive(false); + mResponse.result(http::status::ok); + createResponse(); + writeResponse(); + } + + // Construct a response message based on the program state. + void createResponse() { + for (const auto& [key, value] : mCallbacks) { + if (mRequest.target().find(key) != std::string_view::npos) { + value(mRequest, mResponse); + return; + } + } + mResponse.result(http::status::not_found); + mResponse.set(http::field::content_type, "text/plain"); + beast::ostream(mResponse.body()) << "Not found\r\n"; + } + + // Asynchronously transmit the response message. + void writeResponse() { + auto self = shared_from_this(); + mResponse.content_length(mResponse.body().size()); + http::async_write(mSocket, mResponse, [self](beast::error_code ec, std::size_t) { + self->mSocket.shutdown(tcp::socket::shutdown_send, ec); + self->mDeadline.cancel(); + }); + } + + // Check whether we have spent enough time on this connection. + void checkDeadline() { + auto self = shared_from_this(); + mDeadline.async_wait( + [self](beast::error_code ec) { + if(!ec) { + self->mSocket.close(ec); + } + }); + } +}; diff --git a/proto/odc.proto b/proto/odc.proto new file mode 100644 index 00000000..1b26d848 --- /dev/null +++ b/proto/odc.proto @@ -0,0 +1,226 @@ +syntax = "proto3"; + +package odc; + +// The ODC service definition. +// For details also see https://github.com/FairRootGroup/ODC#command-mapping +service ODC { + // Creates a new DDS session or attaches to an existing DDS session. + rpc Initialize (InitializeRequest) returns (GeneralReply) {} + // Submits DDS agents (deploys a dynamic cluster) according to a specified computing resources. + // Can be called multiple times in order to submit more DDS agents (allocate more resources). + rpc Submit (SubmitRequest) returns (GeneralReply) {} + // Activates a given topology. + rpc Activate (ActivateRequest) returns (GeneralReply) {} + // Run request combines Initialize, Submit and Activate into a single request. + // Run request always creates a new DDS session. + rpc Run (RunRequest) returns (GeneralReply) {} + // Updates a topology (up or down scale number of tasks or any other topology change). + // It consists of 3 commands: Reset, Activate and Configure. + // Can be called multiple times. + rpc Update (UpdateRequest) returns (GeneralReply) {} + // Transitions devices into Ready state. + rpc Configure (ConfigureRequest) returns (StateReply) {} + // Changes devices configuration. + rpc SetProperties (SetPropertiesRequest) returns (GeneralReply) {} + // Get current aggregated state of devices. + rpc GetState (StateRequest) returns (StateReply) {} + // Transition devices into Running state. + rpc Start (StartRequest) returns (StateReply) {} + // Transitions devices into Ready state. + rpc Stop (StopRequest) returns (StateReply) {} + // Transitions devices into Idle state. + rpc Reset (ResetRequest) returns (StateReply) {} + // Shuts devices down via End transition. + rpc Terminate (TerminateRequest) returns (StateReply) {} + // Shutdown DDS session. + rpc Shutdown (ShutdownRequest) returns (GeneralReply) {} + // Status request. + rpc Status (StatusRequest) returns (StatusReply) {} +} + +// Request status +enum ReplyStatus { + UNKNOWN = 0; // Status is unknown + SUCCESS = 1; // Request performed successfully + ERROR = 2; // Failed to perform request +} + +// Session status as defined by DDS +enum SessionStatus { + UNKNOWN_ = 0; // Status is unknown + RUNNING = 1; // DDS session is running + STOPPED = 2; // DDS session is stopped +} + +// General error +message Error { + string msg = 1; // Detailed error message + int32 code = 2; // Error code. + // TODO: Make error codes specific for each concrete request and/or error type. + // TODO: Add link to a documented error codes. +} + +// General reply to requests +message GeneralReply { + string msg = 1; // Detailed reply message + ReplyStatus status = 2; // Request status code (UNKNOWN, SUCCESS, ERROR) + Error error = 3; // If status is ERROR than this field contains error description otherwise it's empty + int32 exectime = 4; // Request execution time in ms + string partitionid = 5; // Partition ID from ECS + string sessionid = 6; // DDS session ID + string state = 7; // If successful and applicable to a request then contains an aggregated FairMQ device state, otherwise UNDEFINED. + uint64 runnr = 8; // Run number from ECS (optional) + repeated string hosts = 9; // Where applicable, provides a list of used hosts (Submit/Run requests) +} + +// Device information +// Runtime task ID and path are the same as in DDS. +// To get task details use DDS Topology API. +message Device { + uint64 id = 1; // Runtime task ID (same as in DDS) + string state = 2; // FairMQ device state as string + string path = 3; // Runtime task path (same as in DDS) + bool ignored = 4; // Device was stopped and set to be ignored + string host = 5; // Host where the task runs +} + +// Device change/get state request +message StateRequest { + string partitionid = 1; // Partition ID from ECS + uint64 runnr = 4; // Run number from ECS + uint32 timeout = 5; // Request timeout in sec. If not set or 0 than default is used. + string path = 2; // Task path in the DDS topology. Can be a regular expression. + bool detailed = 3; // If true then a list of affected devices is populated in the reply. +} + +// Device change/get state reply +message StateReply { + GeneralReply reply = 1; // General reply. See GeneralReply message for details. + repeated Device devices = 2; // If detailed reply is requested then this field contains a list of affected devices otherwise it's empty. +} + +// Status of each partition +message PartitionStatus { + string partitionid = 1; // Partition ID from ECS + uint64 runnr = 5; // Run number from ECS + uint32 timeout = 6; // Request timeout in sec. If not set or 0 than default is used. + string sessionid = 2; // DDS session ID + SessionStatus status = 3; // DDS session status + string state = 4; // If successful and applicable to a request then contains an aggregated FairMQ device state, otherwise UNDEFINED. +} + +// ODC status reply +message StatusReply { + string msg = 1; // Detailed reply message + ReplyStatus status = 2; // Request status code (UNKNOWN, SUCCESS, ERROR) + Error error = 3; // If status is ERROR than this field contains error description otherwise it's empty + int32 exectime = 4; // Request execution time in ms + repeated PartitionStatus partitions = 5; // Status of each partition +} + +// Initialize request +message InitializeRequest { + string partitionid = 1; // Partition ID from ECS + uint64 runnr = 3; // Run number from ECS + uint32 timeout = 4; // Request timeout in sec. If not set or 0 than default is used. + string sessionid = 2; // DDS session ID. If session ID is provided that ODC connects to an existing DDS session. If session ID is an empty string that a new DDS session is created. +} + +// Submit request +message SubmitRequest { + string partitionid = 1; // Partition ID from ECS + uint64 runnr = 4; // Run number from ECS + uint32 timeout = 5; // Request timeout in sec. If not set or 0 than default is used. + string plugin = 2; // Name of the resource plugin registered in odc-server + string resources = 3; // Resource description +} + +// Activate request +message ActivateRequest { + string partitionid = 1; // Partition ID from ECS + uint64 runnr = 5; // Run number from ECS + uint32 timeout = 6; // Request timeout in sec. If not set or 0 than default is used. + // Either `topology`, `content` or `script` has to be set. If all or none is set then an error is returned. + string topology = 2; // Filepath to the XML DDS topology file + string content = 3; // Content of the XML DDS topology + string script = 4; // Shell commands to be executed by ODC in order to generate content of the XML DDS topology +} + +// Run request +message RunRequest { + string partitionid = 1; // Partition ID from ECS + uint64 runnr = 7; // Run number from ECS + uint32 timeout = 8; // Request timeout in sec. If not set or 0 than default is used. + // Either `topology`, `content` or `script` has to be set. If all or none is set then an error is returned. + string topology = 2; // Filepath to the XML DDS topology file + string content = 5; // Content of the XML DDS topology + string script = 6; // Shell commands to be executed by ODC in order to generate content of the XML DDS topology + string plugin = 3; // Name of the resource plugin registered in odc-server + string resources = 4; // Resource description + bool extractTopoResources = 9; // extract required resources from the topology file only (plugin & resources fields are ignored) +} + +// Update request +message UpdateRequest { + string partitionid = 1; // Partition ID from ECS + uint64 runnr = 5; // Run number from ECS + uint32 timeout = 6; // Request timeout in sec. If not set or 0 than default is used. + // Either `topology`, `content` or `script` has to be set. If all or none is set then an error is returned. + string topology = 2; // Filepath to the XML DDS topology file + string content = 3; // Content of the XML DDS topology + string script = 4; // Shell commands to be executed by ODC in order to generate content of the XML DDS topology +} + +// Shutdown request +message ShutdownRequest { + string partitionid = 1; // Partition ID from ECS + uint64 runnr = 2; // Run number from ECS + uint32 timeout = 3; // Request timeout in sec. If not set or 0 than default is used. +} + +// Key-Value property +message Property { + string key = 1; // Property key + string value = 2; // Property value +} + +// Set properties request +message SetPropertiesRequest { + string partitionid = 1; // Partition ID from ECS + uint64 runnr = 4; // Run number from ECS + uint32 timeout = 5; // Request timeout in sec. If not set or 0 than default is used. + string path = 2; // Task path in the DDS topology. Can be a regular expression. + repeated Property properties = 3; // List of properties to be set +} + +// Configure request +message ConfigureRequest { + StateRequest request = 1; // State change request. See StateRequest for details. +} + +// Start request +message StartRequest { + StateRequest request = 1; // State change request. See StateRequest for details. +} + +// Stop request +message StopRequest { + StateRequest request = 1; // State change request. See StateRequest for details. +} + +// Reset request +message ResetRequest { + StateRequest request = 1; // State change request. See StateRequest for details. +} + +// Terminate request +message TerminateRequest { + StateRequest request = 1; // State change request. See StateRequest for details. +} + +// Status request +message StatusRequest { + bool running = 1; // Select only running DDS sessions +} + From 2daddd29a69cd517cff2d186d1631f4d867ce82c Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Wed, 6 Sep 2023 16:10:57 +0200 Subject: [PATCH 13/14] [OMON-647] Scrape AliECS endpoint for inactive tasks (#330) --- CMakeLists.txt | 76 +++--- examples/16-AliECS.cxx | 86 +++++++ proto/o2control.proto | 525 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 651 insertions(+), 36 deletions(-) create mode 100644 examples/16-AliECS.cxx create mode 100644 proto/o2control.proto diff --git a/CMakeLists.txt b/CMakeLists.txt index 471e08b1..3a583a92 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -303,44 +303,48 @@ endif() # gRPC #################################### if(Protobuf_FOUND AND gRPC_FOUND) - set(PROTO_FILE ${CMAKE_CURRENT_SOURCE_DIR}/proto/odc.proto) - get_filename_component(PROTO_OUTPUT_NAME ${PROTO_FILE} NAME_WE) - get_filename_component(PROTO_FILE_PREFIX ${PROTO_FILE} PATH) - set(PROTO_CPP_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${PROTO_OUTPUT_NAME}.pb.cc) - set(GRPC_CPP_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${PROTO_OUTPUT_NAME}.grpc.pb.cc) - - add_custom_command( - OUTPUT "${PROTO_CPP_OUTPUT}" - COMMAND protobuf::protoc - ARGS --proto_path ${PROTO_FILE_PREFIX} - --cpp_out ${CMAKE_CURRENT_BINARY_DIR} - ${PROTO_OUTPUT_NAME}.proto - DEPENDS ${PROTO_FILE} - COMMENT "Running protoc on ${PROTO_FILE}" - VERBATIM) - - add_custom_command( - OUTPUT "${GRPC_CPP_OUTPUT}" - COMMAND protobuf::protoc - ARGS --proto_path ${PROTO_FILE_PREFIX} - --grpc_out=${CMAKE_CURRENT_BINARY_DIR} - --plugin=protoc-gen-grpc=$ - ${PROTO_OUTPUT_NAME}.proto - DEPENDS ${PROTO_FILE} - COMMENT "Running protoc/gRPC on ${PROTO_FILE}" - VERBATIM) + set(PROTO_FILES ${CMAKE_CURRENT_SOURCE_DIR}/proto/odc.proto ${CMAKE_CURRENT_SOURCE_DIR}/proto/o2control.proto) + set(EXAMPLES examples/15-ODC.cxx examples/16-AliECS.cxx) + foreach(PROTO_FILE example IN ZIP_LISTS PROTO_FILES EXAMPLES) + get_filename_component(PROTO_OUTPUT_NAME ${PROTO_FILE} NAME_WE) + get_filename_component(PROTO_FILE_PREFIX ${PROTO_FILE} PATH) + set(PROTO_CPP_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${PROTO_OUTPUT_NAME}.pb.cc) + set(GRPC_CPP_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${PROTO_OUTPUT_NAME}.grpc.pb.cc) + + add_custom_command( + OUTPUT "${PROTO_CPP_OUTPUT}" + COMMAND protobuf::protoc + ARGS --proto_path ${PROTO_FILE_PREFIX} + --cpp_out ${CMAKE_CURRENT_BINARY_DIR} + ${PROTO_OUTPUT_NAME}.proto + DEPENDS ${PROTO_FILE} + COMMENT "Running protoc on ${PROTO_FILE}" + VERBATIM) + + add_custom_command( + OUTPUT "${GRPC_CPP_OUTPUT}" + COMMAND protobuf::protoc + ARGS --proto_path ${PROTO_FILE_PREFIX} + --grpc_out=${CMAKE_CURRENT_BINARY_DIR} + --plugin=protoc-gen-grpc=$ + ${PROTO_OUTPUT_NAME}.proto + DEPENDS ${PROTO_FILE} + COMMENT "Running protoc/gRPC on ${PROTO_FILE}" + VERBATIM) - set(example examples/15-ODC.cxx) - get_filename_component(example_name ${example} NAME) - string(REGEX REPLACE ".cxx" "" example_name ${example_name}) - add_executable(${example_name} ${example} ${PROTO_CPP_OUTPUT} ${GRPC_CPP_OUTPUT}) - target_link_libraries(${example_name} PRIVATE - gRPC::grpc++ - protobuf::libprotobuf - Boost::program_options - ) - target_include_directories(${example_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + get_filename_component(example_name ${example} NAME) + string(REGEX REPLACE ".cxx" "" example_name ${example_name}) + add_executable(${example_name} ${example} ${PROTO_CPP_OUTPUT} ${GRPC_CPP_OUTPUT}) + target_link_libraries(${example_name} PRIVATE + Monitoring + gRPC::grpc++ + protobuf::libprotobuf + Boost::program_options + ) + target_include_directories(${example_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + endforeach() set_target_properties(15-ODC PROPERTIES OUTPUT_NAME "o2-monitoring-odc") + set_target_properties(16-AliECS PROPERTIES OUTPUT_NAME "o2-monitoring-aliecs-tasks") endif() #################################### diff --git a/examples/16-AliECS.cxx b/examples/16-AliECS.cxx new file mode 100644 index 00000000..b0de85de --- /dev/null +++ b/examples/16-AliECS.cxx @@ -0,0 +1,86 @@ +/// +/// \file 16-AliECS.cxx +/// \author Adam Wegrzynek +/// + +#include +#include +#include "o2control.grpc.pb.h" +#include "helpers/HttpConnection.h" +#include +#include +#include +#include "../src/Backends/InfluxDB.h" +#include "../src/Transports/HTTP.h" +#include "../src/MonLogger.h" + +using o2::monitoring::MonLogger; +using grpc::Channel; +using grpc::ClientContext; +using grpc::Status; + +using o2control::EnvironmentInfo; +using namespace o2::monitoring; + +class AliEcsClient { + public: + AliEcsClient(std::shared_ptr channel) : mStub(o2control::Control::NewStub(channel)) {} + void sendRunDetails(const auto& influxBackend) { + o2control::GetEnvironmentsRequest request; + request.set_showall(false); + request.set_showtaskinfos(false); + o2control::GetEnvironmentsReply reply; + ClientContext context; + Status status = mStub->GetEnvironments(&context, request, &reply); + if (status.ok()) { + MonLogger::Get() << "Status call OK" << MonLogger::End(); + for (int i = 0; i < reply.environments_size(); i++) { + if (reply.environments(i).currentrunnumber() > 1) { + MonLogger::Get() << "Env ID" << reply.environments(i).id() << MonLogger::End(); + auto metric = Metric{"tasks"}; + metric.addValue(reply.environments(i).numberofactivetasks(), "active").addValue(reply.environments(i).numberofinactivetasks(), "inactive"); + influxBackend->sendWithRun(metric, reply.environments(i).id(), std::to_string(reply.environments(i).currentrunnumber())); + } + } + } else { + std::cout << status.error_code() << ": " << status.error_message() << std::endl; + } + } + private: + std::unique_ptr mStub; +}; + + +int main(int argc, char* argv[]) { + boost::program_options::options_description desc("Program options"); + desc.add_options() + ("aliecs-host", boost::program_options::value()->required(), "AliECS hostname") + ("aliecs-port", boost::program_options::value()->required(), "AliECS port") + ("influxdb-url", boost::program_options::value()->required(), "InfluxDB hostname") + ("influxdb-token", boost::program_options::value()->required(), "InfluxDB token") + ("influxdb-org", boost::program_options::value()->default_value("cern"), "InfluxDB organisation") + ("influxdb-bucket", boost::program_options::value()->default_value("aliecs"), "InfluxDB bucket"); + boost::program_options::variables_map vm; + boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); + boost::program_options::notify(vm); + MonLogger::mLoggerSeverity = o2::monitoring::Severity::Debug; + MonLogger::Get() << "Connected to AliECS server: " << vm["aliecs-host"].as() << ":" << vm["aliecs-port"].as() << MonLogger::End(); + grpc::ChannelArguments args; + args.SetMaxReceiveMessageSize(20*1024*1024); + AliEcsClient client(grpc::CreateCustomChannel( + vm["aliecs-host"].as() + ":" + std::to_string(vm["aliecs-port"].as()), + grpc::InsecureChannelCredentials(), + args + )); + auto httpTransport = std::make_unique( + vm["influxdb-url"].as() + "/api/v2/write?" + + "org=" + vm["influxdb-org"].as() + "&" + + "bucket=" + vm["influxdb-bucket"].as() + ); + httpTransport->addHeader("Authorization: Token " + vm["influxdb-token"].as()); + auto influxdbBackend = std::make_unique(std::move(httpTransport)); + for (;;) { + client.sendRunDetails(influxdbBackend); + std::this_thread::sleep_for(std::chrono::seconds(15)); + } +} diff --git a/proto/o2control.proto b/proto/o2control.proto new file mode 100644 index 00000000..b575da22 --- /dev/null +++ b/proto/o2control.proto @@ -0,0 +1,525 @@ +/* + * === This file is part of ALICE O² === + * + * Copyright 2018 CERN and copyright holders of ALICE O². + * Author: Teo Mrnjavac + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In applying this license CERN does not waive the privileges and + * immunities granted to it by virtue of its status as an + * Intergovernmental Organization or submit itself to any jurisdiction. + */ + +syntax = "proto3"; + +package o2control; +option java_package = "ch.cern.alice.o2.control.rpcserver"; +option go_package = "github.com/AliceO2Group/Control/core/protos;pb"; + +//////////////// Common event messages /////////////// + +message Event_MesosHeartbeat { +} + +message Ev_EnvironmentEvent { + string environmentId = 1; + string state = 2; + uint32 currentRunNumber = 3; + string error = 4; + string message = 5; +} + +message Ev_TaskEvent { + string name = 1; + string taskid = 2; + string state = 3; + string status = 4; + string hostname = 5; + string className = 6; +} + +message Ev_RoleEvent { + string name = 1; + string status = 2; + string state = 3; + string rolePath = 4; +} + +////////////////////////////////////////////////////// + +service Control { + rpc TrackStatus (StatusRequest) returns (stream StatusReply) {} + + rpc GetFrameworkInfo (GetFrameworkInfoRequest) returns (GetFrameworkInfoReply) {} + rpc Teardown (TeardownRequest) returns (TeardownReply) {} + + rpc GetEnvironments (GetEnvironmentsRequest) returns (GetEnvironmentsReply) {} + rpc NewAutoEnvironment (NewAutoEnvironmentRequest) returns (NewAutoEnvironmentReply) {} + rpc NewEnvironment (NewEnvironmentRequest) returns (NewEnvironmentReply) {} + rpc GetEnvironment (GetEnvironmentRequest) returns (GetEnvironmentReply) {} + rpc ControlEnvironment (ControlEnvironmentRequest) returns (ControlEnvironmentReply) {} + rpc ModifyEnvironment (ModifyEnvironmentRequest) returns (ModifyEnvironmentReply) {} + rpc DestroyEnvironment (DestroyEnvironmentRequest) returns (DestroyEnvironmentReply) {} + rpc GetActiveDetectors (Empty) returns (GetActiveDetectorsReply) {} + +// rpc SetEnvironmentProperties (SetEnvironmentPropertiesRequest) returns (SetEnvironmentPropertiesReply) {} +// rpc GetEnvironmentProperties (GetEnvironmentPropertiesRequest) returns (GetEnvironmentPropertiesReply) {} + + rpc GetTasks (GetTasksRequest) returns (GetTasksReply) {} + rpc GetTask(GetTaskRequest) returns (GetTaskReply) {} + rpc CleanupTasks(CleanupTasksRequest) returns (CleanupTasksReply) {} + + rpc GetRoles (GetRolesRequest) returns (GetRolesReply) {} + + rpc GetWorkflowTemplates (GetWorkflowTemplatesRequest) returns (GetWorkflowTemplatesReply) {} + + rpc ListRepos(ListReposRequest) returns (ListReposReply) {} + rpc AddRepo(AddRepoRequest) returns (AddRepoReply) {} + rpc RemoveRepo(RemoveRepoRequest) returns (RemoveRepoReply) {} + rpc RefreshRepos(RefreshReposRequest) returns (Empty) {} + rpc SetDefaultRepo(SetDefaultRepoRequest) returns (Empty) {} + rpc SetGlobalDefaultRevision(SetGlobalDefaultRevisionRequest) returns (Empty) {} + rpc SetRepoDefaultRevision(SetRepoDefaultRevisionRequest) returns (SetRepoDefaultRevisionReply) {} + rpc Subscribe(SubscribeRequest) returns (stream Event); + + rpc GetIntegratedServices(Empty) returns (ListIntegratedServicesReply) {} +} + +//////////////////////////////////////// +// Global status +//////////////////////////////////////// +message StatusRequest {} +message StatusReply { + string state = 1; + repeated StatusUpdate statusUpdates = 2; +} +message StatusUpdate { + enum Level { + DEBUG = 0; + INFO = 1; + WARNING = 2; + ERROR = 3; + } + Level level = 1; + oneof Event { + Event_MesosHeartbeat mesosHeartbeat = 2; + //TODO add other events here and in events.proto + } +} + +message Event { + string timestamp = 1; + oneof Payload { + Ev_EnvironmentEvent environmentEvent = 2; + Ev_TaskEvent taskEvent = 3; + Ev_RoleEvent roleEvent = 4; + } +} + +message SubscribeRequest{ + string id = 1; +} +//////////////////////////////////////// +// Framework +//////////////////////////////////////// +message GetFrameworkInfoRequest {} +message Version { + int32 major = 1; + int32 minor = 2; + int32 patch = 3; + string build = 4; + string productName = 5; + string versionStr = 6; +} +message GetFrameworkInfoReply { + string frameworkId = 1; + int32 environmentsCount = 2; + int32 tasksCount = 3; + string state = 4; + int32 hostsCount = 5; + string instanceName = 6; + Version version = 7; + string configurationEndpoint = 8; + repeated string detectorsInInstance = 9; + repeated string activeDetectors = 10; + repeated string availableDetectors = 11; +} + +// Not implemented yet +message TeardownRequest { + string reason = 1; +} +message TeardownReply {} + +//////////////////////////////////////// +// Environment +//////////////////////////////////////// +message GetEnvironmentsRequest { + bool showAll = 1; + bool showTaskInfos = 2; +} +message GetEnvironmentsReply { + string frameworkId = 1; + repeated EnvironmentInfo environments = 2; +} +message EnvironmentInfo { + string id = 1; + int64 createdWhen = 2; // msec + string state = 3; + repeated ShortTaskInfo tasks = 4; + string rootRole = 5; + uint32 currentRunNumber = 6; + map defaults = 7; + map vars = 8; + map userVars = 9; + int32 numberOfFlps = 10; + repeated string includedDetectors = 11; + string description = 12; + int32 numberOfHosts = 13; + map integratedServicesData = 14; + int32 numberOfTasks = 15; + string currentTransition = 16; + int32 numberOfActiveTasks = 17; + int32 numberOfInactiveTasks = 18; +} + +message NewEnvironmentRequest { + string workflowTemplate = 1; + map vars = 2; + bool public = 3; +} +message NewEnvironmentReply { + EnvironmentInfo environment = 1; + bool public = 2; +} +message NewAutoEnvironmentRequest { + string workflowTemplate = 1; + map vars = 2; + string id = 3; +} +message NewAutoEnvironmentReply { + +} + +message GetEnvironmentRequest { + string id = 1; + bool showWorkflowTree = 2; +} +message GetEnvironmentReply { + EnvironmentInfo environment = 1; + RoleInfo workflow = 2; + bool public = 3; +} + +message ControlEnvironmentRequest { + string id = 1; + enum Optype { + NOOP = 0; + START_ACTIVITY = 1; + STOP_ACTIVITY = 2; + CONFIGURE = 3; + RESET = 4; + GO_ERROR = 5; + DEPLOY = 6; + } + Optype type = 2; +} +message ControlEnvironmentReply { + string id = 1; + string state = 2; + uint32 currentRunNumber = 3; + // All times are in milliseconds + int64 startOfTransition = 4; + int64 endOfTransition = 5; + int64 transitionDuration = 6; +} + +message ModifyEnvironmentRequest { + string id = 1; + repeated EnvironmentOperation operations = 2; + bool reconfigureAll = 3; +} +message EnvironmentOperation { + enum Optype { + NOOP = 0; + REMOVE_ROLE = 3; + ADD_ROLE = 4; + } + Optype type = 1; + string roleName = 2; +} +message ModifyEnvironmentReply { + repeated EnvironmentOperation failedOperations = 1; + string id = 2; + string state = 3; +} + +message DestroyEnvironmentRequest { + string id = 1; + bool keepTasks = 2; + bool allowInRunningState = 3; + bool force = 4; +} +message DestroyEnvironmentReply { + CleanupTasksReply cleanupTasksReply = 1; +} + +message GetActiveDetectorsReply { + repeated string detectors = 1; +} + +//////////////////////////////////////// +// Environment, GET/SET properties +//////////////////////////////////////// +message SetEnvironmentPropertiesRequest { + string id = 1; + // If properties == nil, the core sets nothing + // and reply ok + map properties = 2; +} +message SetEnvironmentPropertiesReply {} + +message GetEnvironmentPropertiesRequest { + string id = 1; + // If len(queries) == 0, we return an + // empty map. + // To retrieve all KVs, use query '*' + repeated string queries = 2; + bool excludeGlobals = 3; +} +message GetEnvironmentPropertiesReply { + map properties = 1; +} + + +//////////////////////////////////////// +// Tasks +//////////////////////////////////////// +message ShortTaskInfo { + string name = 1; + bool locked = 2; + string taskId = 3; + string status = 4; + string state = 5; + string className = 6; + TaskDeploymentInfo deploymentInfo = 7; + string pid = 8; + string sandboxStdout = 9; + bool claimable = 10; +} +message TaskDeploymentInfo { + string hostname = 1; + string agentId = 2; + string offerId = 3; + string executorId = 4; +} + +message GetTasksRequest {} +message GetTasksReply { + repeated ShortTaskInfo tasks = 1; +} +message GetTaskRequest { + string taskId = 1; +} +message GetTaskReply { + TaskInfo task = 1; +} + +message TaskClassInfo { + string name = 1; + string controlMode = 2; +} +message CommandInfo { + repeated string env = 1; + bool shell = 2; + string value = 3; + repeated string arguments = 4; + string user = 5; +} +message ChannelInfo { + string name = 1; + string type = 2; + string target = 3; +} +message TaskInfo { + ShortTaskInfo shortInfo = 1; + TaskClassInfo classInfo = 2; + repeated ChannelInfo inboundChannels = 3; + repeated ChannelInfo outboundChannels = 4; + CommandInfo commandInfo = 5; + string taskPath = 6; + string envId = 7; + map properties = 9; +} + +message CleanupTasksRequest { + repeated string taskIds = 1; +} +message CleanupTasksReply { + repeated ShortTaskInfo killedTasks = 1; + repeated ShortTaskInfo runningTasks = 2; +} + +//////////////////////////////////////// +// Roles +//////////////////////////////////////// +message GetRolesRequest { + string envId = 1; + string pathSpec = 2; +} + +message RoleInfo { + string name = 1; + string status = 2; + string state = 3; + string fullPath = 4; + repeated string taskIds = 5; + repeated RoleInfo roles = 6; + map defaults = 7; + map vars = 8; + map userVars = 9; + map consolidatedStack = 10; + string description = 11; +} + +message GetRolesReply { + repeated RoleInfo roles = 1; +} + +message GetWorkflowTemplatesRequest{ + string repoPattern = 1; + string revisionPattern = 2; + bool allBranches = 3; + bool allTags = 4; + bool allWorkflows = 5; +} + +message VarSpecMessage { + enum UiWidget { + editBox = 0; // plain string input line, can accept types number (like a spinBox) and string + slider = 1; // input widget exclusively for numbers, range allowedValues[0]-[1] + listBox = 2; // displays a list of items, can accept types number, string or list; if number/string ==> single selection, otherwise multiple selection allowed + dropDownBox = 3; + comboBox = 4; + radioButtonBox = 5; + checkBox = 6; + } + + enum Type { + string = 0; + number = 1; + bool = 2; + list = 3; + map = 4; + } + + string defaultValue = 1; + Type type = 2; + string label = 3; + string description = 4; + UiWidget widget = 5; + string panel = 6; // hint for the UI on where to put or group the given variable input + repeated string allowedValues = 7; // list of offered values from which to choose (only for some UiWidgets) + int32 index = 8; + string visibleIf = 9; // JS expression that evaluates to bool + string enabledIf = 10; // JS expression that evaluates to bool +} + +message WorkflowTemplateInfo { + string repo = 1; + string template = 2; + string revision = 3; + map varSpecMap = 4; + string description = 5; +} + +message GetWorkflowTemplatesReply{ + repeated WorkflowTemplateInfo workflowTemplates = 1; +} + +//////////////////////////////////////// +// Repos +//////////////////////////////////////// + +message ListReposRequest { + bool getRevisions = 1; +} + +message RepoInfo { + string name = 1; + bool default = 2; + string defaultRevision = 3; + repeated string revisions = 4; +} + +message ListReposReply { + repeated RepoInfo repos = 1; + string globalDefaultRevision = 2; +} + +message AddRepoRequest { + string name = 1; + string defaultRevision = 2; +} + +message AddRepoReply { + string newDefaultRevision = 1; + string info = 2; +} + +message RemoveRepoRequest { + int32 index = 1; +} + +message RemoveRepoReply { + string newDefaultRepo = 1; +} + +message RefreshReposRequest { + int32 index = 1; +} + +message SetDefaultRepoRequest { + int32 index = 1; +} + +message SetGlobalDefaultRevisionRequest { + string revision = 1; +} + +message SetRepoDefaultRevisionRequest { + int32 index = 1; + string revision = 2; +} + +message SetRepoDefaultRevisionReply { + string info = 1; +} + +message Empty { + +} + +message ListIntegratedServicesReply { + map services = 1; // keys are IDs (e.g. "ddsched"), the service name should be displayed to users instead +} + +message IntegratedServiceInfo { + string name = 1; // user-visible service name, e.g. "DD scheduler" + bool enabled = 2; + string endpoint = 3; + string connectionState = 4; // allowed values: READY, CONNECTING, TRANSIENT_FAILURE, IDLE, SHUTDOWN + string data = 5; // always a JSON payload with a map inside. +} \ No newline at end of file From 2f917a2d4f84826f631cc8b210f9cae0e8c90128 Mon Sep 17 00:00:00 2001 From: Adam Wegrzynek Date: Fri, 6 Oct 2023 11:29:07 +0200 Subject: [PATCH 14/14] [OMON-661] Remove division by 0 log message (#319) --- CMakeLists.txt | 2 +- src/DerivedMetrics.cxx | 2 +- test/testDerived.cxx | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a583a92..1c0f3bd0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() # Define project project(Monitoring - VERSION 3.17.3 + VERSION 3.17.4 DESCRIPTION "O2 Monitoring library" LANGUAGES CXX ) diff --git a/src/DerivedMetrics.cxx b/src/DerivedMetrics.cxx index bc9405bd..de633886 100644 --- a/src/DerivedMetrics.cxx +++ b/src/DerivedMetrics.cxx @@ -80,7 +80,7 @@ bool DerivedMetrics::process(Metric& metric, DerivedMetricMode mode) int timestampCount = timestampDifference.count(); // disallow dividing by 0 if (timestampCount == 0) { - throw MonitoringException("DerivedMetrics", "Division by 0 when calculating rate for: " + metric.getName() + "/" + metric.getFirstValue().first); + //throw MonitoringException("DerivedMetrics", "Division by 0 when calculating rate for: " + metric.getName() + "/" + metric.getFirstValue().first); } auto current = metric.getFirstValue().second; diff --git a/test/testDerived.cxx b/test/testDerived.cxx index 5e35578f..16037ce1 100644 --- a/test/testDerived.cxx +++ b/test/testDerived.cxx @@ -171,7 +171,7 @@ bool exceptionCheck(const MonitoringException& e) return false; } -BOOST_AUTO_TEST_CASE(divisionByZero) +/*BOOST_AUTO_TEST_CASE(divisionByZero) { std::string name("test"); o2::monitoring::DerivedMetrics derivedHandler; @@ -179,7 +179,7 @@ BOOST_AUTO_TEST_CASE(divisionByZero) derivedHandler.process(metric, DerivedMetricMode::RATE); BOOST_CHECK_EXCEPTION(derivedHandler.process(metric, DerivedMetricMode::RATE), MonitoringException, exceptionCheck); -} +}*/ BOOST_AUTO_TEST_CASE(derivedIncrementInt) {