From 9ecf021ec20eb393f769188d89f7abb05439e635 Mon Sep 17 00:00:00 2001 From: Costa Tsaousis Date: Mon, 9 Dec 2024 02:37:44 +0200 Subject: [PATCH] Streaming improvements #1 (#19137) * prefer tinysleep over yielding the processor * split spinlocks to separate files * rename spinlock initializers * Optimize ML queuing operations. - Allocate 25% of cores for ML. - Split queues by request type. - Accurate stats for queue operations by type. * abstracted circular buffer into a new private structure to enable using it in receiver sending side - no features added yet, only abstracted the existing functionality - not tested yet * completed the abstraction of stream circular buffer * unified list of receivers and senders; opcodes now support both receivers and senders * use strings in pluginsd * stream receivers send data back to the child using the event loop * do not share pgc aral between caches * pgc uses 4 to 256 partitions, by default equal to the number of CPU cores * add forgotten worker job * workers now monitor spinlock contention * stream sender tries to lock the sender, but does not wait for it - it will be handled later * increase the number of web server threads to the number of cpu cores, with a minimum of 6 * use the nowait versions of nd_sock functions * handle EAGAIN properly * add spinlock contention tracing for rw_spinlock * aral lock/unlock contention tracing * allocate the compressed buffer * use 128KiB for aral default page size; limit memory protection to 5GiB * aral uses mmap() for big pages * enrich log messages * renamed telemetry to pulse * unified sender and receiver socket event loops * logging improvements * NETDATA_LOG_STREAM_SENDER logs inbound and outbound traffic * 16k receiver buffer size to improve interactivity * fix NETDATA_LOG_STREAM_SENDER in sender_execute * do not stream ML models for charts and dimensions that have not been exposed * add support for sending QUIT to plugins and waiting for some time for them to quit gracefully * 
global spinlock contention per function * use an aral per pgc partition; use 8 partitions for PGD * rrdcalc: do not change the frequency of alerts - it uses arbitrary values used during replication, changing permanently the frequency of alerts replication: use 1/3 of the cores or 1 core every 10 nodes (min of the two) pgd: use as many aral partitions as the CPU cores, up to 256 * aral does 1 allocation per page (the structure and the elements together), instead of two * use the evictor thread only when we run out of memory; restore the optimization about prepending or appending clean pages based on their accesses; use the main cache free memory for the other caches, reducing I/O when the main cache has enough room * reduce the number of events per poll() to 10 * aral allocates pages of up to 1MiB; restore processing 100 events per nd_poll() call * drain the sockets while reading * receiver sockets should be non-blocking * add stability detector to aral * increase the receivers send buffer * do not remove the sender or the receiver while we drain the input sockets --------- Co-authored-by: vkalintiris --- CMakeLists.txt | 70 ++- src/aclk/aclk_query.c | 2 +- src/aclk/https_client.c | 2 +- src/claim/claim-with-api.c | 4 +- src/claim/claim_id.c | 2 +- .../cgroups.plugin/cgroup-network.c | 2 +- src/collectors/ebpf.plugin/ebpf_apps.c | 2 +- .../freeipmi.plugin/freeipmi_plugin.c | 4 +- .../proc.plugin/proc_net_dev_renames.c | 2 +- src/collectors/statsd.plugin/statsd.c | 4 +- .../systemd-journal-files.c | 2 +- src/collectors/tc.plugin/plugin_tc.c | 2 +- .../windows-events-providers.c | 2 +- .../windows-events-sources.c | 2 +- src/daemon/buildinfo.c | 2 +- src/daemon/common.h | 2 +- .../netdata-conf-backwards-compatibility.c | 20 +- src/daemon/config/netdata-conf-db.c | 5 +- src/daemon/main.c | 10 +- .../telemetry-aral.c => pulse/pulse-aral.c} | 22 +- src/daemon/pulse/pulse-aral.h | 16 + .../pulse-daemon-memory.c} | 15 +- .../pulse-daemon-memory.h} | 10 +- 
.../pulse-daemon.c} | 20 +- src/daemon/pulse/pulse-daemon.h | 12 + .../pulse-dbengine.c} | 78 +-- src/daemon/pulse/pulse-dbengine.h | 17 + .../pulse-dictionary.c} | 18 +- .../pulse-dictionary.h} | 10 +- .../pulse-gorilla.c} | 14 +- src/daemon/pulse/pulse-gorilla.h | 15 + .../pulse-heartbeat.c} | 8 +- src/daemon/pulse/pulse-heartbeat.h | 12 + .../pulse-http-api.c} | 28 +- .../pulse-http-api.h} | 16 +- .../pulse-ingestion.c} | 14 +- src/daemon/pulse/pulse-ingestion.h | 14 + .../telemetry-ml.c => pulse/pulse-ml.c} | 20 +- src/daemon/pulse/pulse-ml.h | 33 + .../pulse-queries.c} | 25 +- src/daemon/pulse/pulse-queries.h | 17 + .../pulse-sqlite3.c} | 20 +- src/daemon/pulse/pulse-sqlite3.h | 15 + .../pulse-string.c} | 12 +- src/daemon/pulse/pulse-string.h | 12 + .../pulse-trace-allocations.c} | 14 +- src/daemon/pulse/pulse-trace-allocations.h | 14 + .../pulse-workers.c} | 295 ++++++++- src/daemon/pulse/pulse-workers.h | 13 + .../{telemetry/telemetry.c => pulse/pulse.c} | 104 ++-- src/daemon/pulse/pulse.h | 30 + src/daemon/static_threads.c | 20 +- src/daemon/telemetry/telemetry-aral.h | 16 - src/daemon/telemetry/telemetry-daemon.h | 12 - src/daemon/telemetry/telemetry-dbengine.h | 17 - src/daemon/telemetry/telemetry-gorilla.h | 15 - src/daemon/telemetry/telemetry-heartbeat.h | 12 - src/daemon/telemetry/telemetry-ingestion.h | 14 - src/daemon/telemetry/telemetry-ml.h | 33 - src/daemon/telemetry/telemetry-queries.h | 17 - src/daemon/telemetry/telemetry-sqlite3.h | 15 - src/daemon/telemetry/telemetry-string.h | 12 - .../telemetry/telemetry-trace-allocations.h | 14 - src/daemon/telemetry/telemetry-workers.h | 13 - src/daemon/telemetry/telemetry.h | 30 - src/database/contexts/query_target.c | 4 +- src/database/engine/cache.c | 137 ++--- src/database/engine/datafile.c | 5 +- src/database/engine/datafile.h | 3 +- src/database/engine/metric.c | 4 +- src/database/engine/page.c | 49 +- src/database/engine/pagecache.c | 43 +- src/database/engine/pdc.c | 10 +- 
src/database/engine/rrdengine.c | 16 +- src/database/engine/rrdengineapi.c | 2 +- src/database/rrdlabels.c | 3 +- src/database/rrdset.c | 2 +- src/database/sqlite/sqlite_functions.c | 10 +- src/exporting/process_data.c | 2 +- src/exporting/send_internal_metrics.c | 6 +- src/health/health.c | 2 +- src/health/rrdcalc.c | 21 +- src/libnetdata/aral/aral.c | 364 +++++++---- src/libnetdata/aral/aral.h | 2 +- src/libnetdata/avl/avl.h | 2 +- src/libnetdata/clocks/clocks.c | 2 +- src/libnetdata/config/appconfig.h | 4 +- src/libnetdata/config/appconfig_conf_file.c | 2 +- .../functions_evloop/functions_evloop.c | 11 +- .../functions_evloop/functions_evloop.h | 2 + src/libnetdata/july/july.c | 2 +- src/libnetdata/libnetdata.c | 8 +- src/libnetdata/libnetdata.h | 2 + src/libnetdata/local-sockets/local-sockets.h | 2 +- src/libnetdata/locks/locks.c | 250 -------- src/libnetdata/locks/locks.h | 48 -- src/libnetdata/locks/rw-spinlock.c | 103 ++++ src/libnetdata/locks/rw-spinlock.h | 34 ++ src/libnetdata/locks/spinlock.c | 63 ++ src/libnetdata/locks/spinlock.h | 48 ++ src/libnetdata/log/nd_log-internals.c | 18 +- src/libnetdata/log/nd_log-to-windows-events.c | 2 +- src/libnetdata/os/random.c | 2 +- .../system-maps/cache-host-users-and-groups.c | 4 +- .../os/system-maps/cached-gid-groupname.c | 2 +- .../os/system-maps/cached-sid-username.c | 2 +- .../os/system-maps/cached-uid-username.c | 2 +- .../os/windows-perflib/perflib-names.c | 2 +- src/libnetdata/query_progress/progress.c | 2 +- src/libnetdata/socket/nd-poll.c | 14 +- src/libnetdata/socket/nd-sock.h | 1 - src/libnetdata/socket/security.c | 2 +- src/libnetdata/spawn_server/spawn-tester.c | 4 +- src/libnetdata/spawn_server/spawn_popen.c | 6 +- src/libnetdata/spawn_server/spawn_popen.h | 2 +- src/libnetdata/spawn_server/spawn_server.h | 2 +- .../spawn_server/spawn_server_libuv.c | 2 +- .../spawn_server/spawn_server_nofork.c | 15 +- .../spawn_server/spawn_server_posix.c | 6 +- .../spawn_server/spawn_server_windows.c | 42 +- 
src/libnetdata/threads/threads.c | 4 +- .../worker_utilization/worker_utilization.c | 116 +++- .../worker_utilization/worker_utilization.h | 37 +- src/ml/ad_charts.cc | 91 ++- src/ml/ml.cc | 102 ++-- src/ml/ml_memory.cc | 10 +- src/ml/ml_queue.cc | 56 +- src/ml/ml_queue.h | 31 +- src/ml/ml_worker.h | 9 +- src/plugins.d/plugins_d.c | 70 ++- src/plugins.d/plugins_d.h | 11 +- src/plugins.d/pluginsd_functions.c | 6 +- src/plugins.d/pluginsd_internals.c | 5 +- src/plugins.d/pluginsd_internals.h | 19 +- src/plugins.d/pluginsd_parser.c | 12 +- src/plugins.d/pluginsd_parser.h | 4 + src/plugins.d/pluginsd_replication.c | 1 + src/streaming/protocol/command-nodeid.c | 4 +- src/streaming/replication.c | 33 +- src/streaming/replication.h | 3 +- src/streaming/rrdhost-status.c | 12 +- src/streaming/stream-circular-buffer.c | 126 ++++ src/streaming/stream-circular-buffer.h | 83 +++ src/streaming/stream-connector.c | 9 +- src/streaming/stream-handshake.c | 6 +- src/streaming/stream-handshake.h | 9 +- src/streaming/stream-parents.c | 4 +- src/streaming/stream-path.c | 2 +- src/streaming/stream-receiver-connection.c | 8 +- src/streaming/stream-receiver-internals.h | 13 +- src/streaming/stream-receiver.c | 576 ++++++++++++------ src/streaming/stream-sender-api.c | 9 +- src/streaming/stream-sender-commit.c | 82 +-- src/streaming/stream-sender-execute.c | 34 ++ src/streaming/stream-sender-internals.h | 56 +- src/streaming/stream-sender.c | 337 +++++----- src/streaming/stream-thread.c | 172 ++++-- src/streaming/stream-thread.h | 19 +- src/streaming/stream.h | 1 - src/web/api/queries/query.c | 4 +- src/web/api/v3/api_v3_settings.c | 2 +- src/web/rtc/webrtc.c | 2 +- src/web/server/static/static-threaded.c | 7 +- src/web/server/web_client.c | 2 +- src/web/server/web_client_cache.c | 6 +- 165 files changed, 3010 insertions(+), 1939 deletions(-) rename src/daemon/{telemetry/telemetry-aral.c => pulse/pulse-aral.c} (90%) create mode 100644 src/daemon/pulse/pulse-aral.h rename 
src/daemon/{telemetry/telemetry-daemon-memory.c => pulse/pulse-daemon-memory.c} (96%) rename src/daemon/{telemetry/telemetry-daemon-memory.h => pulse/pulse-daemon-memory.h} (73%) rename src/daemon/{telemetry/telemetry-daemon.c => pulse/pulse-daemon.c} (83%) create mode 100644 src/daemon/pulse/pulse-daemon.h rename src/daemon/{telemetry/telemetry-dbengine.c => pulse/pulse-dbengine.c} (98%) create mode 100644 src/daemon/pulse/pulse-dbengine.h rename src/daemon/{telemetry/telemetry-dictionary.c => pulse/pulse-dictionary.c} (98%) rename src/daemon/{telemetry/telemetry-dictionary.h => pulse/pulse-dictionary.h} (78%) rename src/daemon/{telemetry/telemetry-gorilla.c => pulse/pulse-gorilla.c} (93%) create mode 100644 src/daemon/pulse/pulse-gorilla.h rename src/daemon/{telemetry/telemetry-heartbeat.c => pulse/pulse-heartbeat.c} (91%) create mode 100644 src/daemon/pulse/pulse-heartbeat.h rename src/daemon/{telemetry/telemetry-http-api.c => pulse/pulse-http-api.c} (94%) rename src/daemon/{telemetry/telemetry-http-api.h => pulse/pulse-http-api.h} (50%) rename src/daemon/{telemetry/telemetry-ingestion.c => pulse/pulse-ingestion.c} (83%) create mode 100644 src/daemon/pulse/pulse-ingestion.h rename src/daemon/{telemetry/telemetry-ml.c => pulse/pulse-ml.c} (87%) create mode 100644 src/daemon/pulse/pulse-ml.h rename src/daemon/{telemetry/telemetry-queries.c => pulse/pulse-queries.c} (95%) create mode 100644 src/daemon/pulse/pulse-queries.h rename src/daemon/{telemetry/telemetry-sqlite3.c => pulse/pulse-sqlite3.c} (97%) create mode 100644 src/daemon/pulse/pulse-sqlite3.h rename src/daemon/{telemetry/telemetry-string.c => pulse/pulse-string.c} (95%) create mode 100644 src/daemon/pulse/pulse-string.h rename src/daemon/{telemetry/telemetry-trace-allocations.c => pulse/pulse-trace-allocations.c} (95%) create mode 100644 src/daemon/pulse/pulse-trace-allocations.h rename src/daemon/{telemetry/telemetry-workers.c => pulse/pulse-workers.c} (77%) create mode 100644 
src/daemon/pulse/pulse-workers.h rename src/daemon/{telemetry/telemetry.c => pulse/pulse.c} (56%) create mode 100644 src/daemon/pulse/pulse.h delete mode 100644 src/daemon/telemetry/telemetry-aral.h delete mode 100644 src/daemon/telemetry/telemetry-daemon.h delete mode 100644 src/daemon/telemetry/telemetry-dbengine.h delete mode 100644 src/daemon/telemetry/telemetry-gorilla.h delete mode 100644 src/daemon/telemetry/telemetry-heartbeat.h delete mode 100644 src/daemon/telemetry/telemetry-ingestion.h delete mode 100644 src/daemon/telemetry/telemetry-ml.h delete mode 100644 src/daemon/telemetry/telemetry-queries.h delete mode 100644 src/daemon/telemetry/telemetry-sqlite3.h delete mode 100644 src/daemon/telemetry/telemetry-string.h delete mode 100644 src/daemon/telemetry/telemetry-trace-allocations.h delete mode 100644 src/daemon/telemetry/telemetry-workers.h delete mode 100644 src/daemon/telemetry/telemetry.h create mode 100644 src/libnetdata/locks/rw-spinlock.c create mode 100644 src/libnetdata/locks/rw-spinlock.h create mode 100644 src/libnetdata/locks/spinlock.c create mode 100644 src/libnetdata/locks/spinlock.h create mode 100644 src/streaming/stream-circular-buffer.c create mode 100644 src/streaming/stream-circular-buffer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ebe892d54dbf1d..f71c1693d3aa28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -944,6 +944,10 @@ set(LIBNETDATA_FILES src/libnetdata/os/system_memory.h src/libnetdata/socket/nd-poll.c src/libnetdata/socket/nd-poll.h + src/libnetdata/locks/spinlock.c + src/libnetdata/locks/spinlock.h + src/libnetdata/locks/rw-spinlock.c + src/libnetdata/locks/rw-spinlock.h ) set(LIBH2O_FILES @@ -1037,8 +1041,8 @@ set(DAEMON_FILES src/daemon/daemon.h src/daemon/libuv_workers.c src/daemon/libuv_workers.h - src/daemon/telemetry/telemetry.c - src/daemon/telemetry/telemetry.h + src/daemon/pulse/pulse.c + src/daemon/pulse/pulse.h src/daemon/analytics.c src/daemon/analytics.h src/daemon/main.c @@ -1068,36 +1072,36 
@@ set(DAEMON_FILES src/daemon/dyncfg/dyncfg-internals.h src/daemon/dyncfg/dyncfg-intercept.c src/daemon/dyncfg/dyncfg-tree.c - src/daemon/telemetry/telemetry-http-api.c - src/daemon/telemetry/telemetry-http-api.h - src/daemon/telemetry/telemetry-queries.c - src/daemon/telemetry/telemetry-queries.h - src/daemon/telemetry/telemetry-ingestion.c - src/daemon/telemetry/telemetry-ingestion.h - src/daemon/telemetry/telemetry-ml.c - src/daemon/telemetry/telemetry-ml.h - src/daemon/telemetry/telemetry-gorilla.c - src/daemon/telemetry/telemetry-gorilla.h - src/daemon/telemetry/telemetry-daemon.c - src/daemon/telemetry/telemetry-daemon.h - src/daemon/telemetry/telemetry-daemon-memory.c - src/daemon/telemetry/telemetry-daemon-memory.h - src/daemon/telemetry/telemetry-sqlite3.c - src/daemon/telemetry/telemetry-sqlite3.h - src/daemon/telemetry/telemetry-dbengine.c - src/daemon/telemetry/telemetry-dbengine.h - src/daemon/telemetry/telemetry-string.c - src/daemon/telemetry/telemetry-string.h - src/daemon/telemetry/telemetry-heartbeat.c - src/daemon/telemetry/telemetry-heartbeat.h - src/daemon/telemetry/telemetry-dictionary.c - src/daemon/telemetry/telemetry-dictionary.h - src/daemon/telemetry/telemetry-workers.c - src/daemon/telemetry/telemetry-workers.h - src/daemon/telemetry/telemetry-trace-allocations.c - src/daemon/telemetry/telemetry-trace-allocations.h - src/daemon/telemetry/telemetry-aral.c - src/daemon/telemetry/telemetry-aral.h + src/daemon/pulse/pulse-http-api.c + src/daemon/pulse/pulse-http-api.h + src/daemon/pulse/pulse-queries.c + src/daemon/pulse/pulse-queries.h + src/daemon/pulse/pulse-ingestion.c + src/daemon/pulse/pulse-ingestion.h + src/daemon/pulse/pulse-ml.c + src/daemon/pulse/pulse-ml.h + src/daemon/pulse/pulse-gorilla.c + src/daemon/pulse/pulse-gorilla.h + src/daemon/pulse/pulse-daemon.c + src/daemon/pulse/pulse-daemon.h + src/daemon/pulse/pulse-daemon-memory.c + src/daemon/pulse/pulse-daemon-memory.h + src/daemon/pulse/pulse-sqlite3.c + 
src/daemon/pulse/pulse-sqlite3.h + src/daemon/pulse/pulse-dbengine.c + src/daemon/pulse/pulse-dbengine.h + src/daemon/pulse/pulse-string.c + src/daemon/pulse/pulse-string.h + src/daemon/pulse/pulse-heartbeat.c + src/daemon/pulse/pulse-heartbeat.h + src/daemon/pulse/pulse-dictionary.c + src/daemon/pulse/pulse-dictionary.h + src/daemon/pulse/pulse-workers.c + src/daemon/pulse/pulse-workers.h + src/daemon/pulse/pulse-trace-allocations.c + src/daemon/pulse/pulse-trace-allocations.h + src/daemon/pulse/pulse-aral.c + src/daemon/pulse/pulse-aral.h src/daemon/config/netdata-conf-db.c src/daemon/config/netdata-conf-db.h src/daemon/config/netdata-conf.h @@ -1543,6 +1547,8 @@ set(STREAMING_PLUGIN_FILES src/streaming/stream-receiver-connection.c src/streaming/stream-sender-commit.h src/streaming/stream-traffic-types.h + src/streaming/stream-circular-buffer.c + src/streaming/stream-circular-buffer.h ) set(WEB_PLUGIN_FILES diff --git a/src/aclk/aclk_query.c b/src/aclk/aclk_query.c index 1d93a5e2de2c66..28c06435428c2d 100644 --- a/src/aclk/aclk_query.c +++ b/src/aclk/aclk_query.c @@ -16,7 +16,7 @@ struct pending_req_list { }; static struct pending_req_list *pending_req_list_head = NULL; -static SPINLOCK pending_req_list_lock = NETDATA_SPINLOCK_INITIALIZER; +static SPINLOCK pending_req_list_lock = SPINLOCK_INITIALIZER; void aclk_config_get_query_scope(void) { const char *s = config_get(CONFIG_SECTION_CLOUD, "scope", "full"); diff --git a/src/aclk/https_client.c b/src/aclk/https_client.c index f04683be0ce985..9703dd7e211942 100644 --- a/src/aclk/https_client.c +++ b/src/aclk/https_client.c @@ -6,7 +6,7 @@ #include "aclk_util.h" -#include "daemon/telemetry/telemetry.h" +#include "daemon/pulse/pulse.h" static const char *http_req_type_to_str(http_req_type_t req) { switch (req) { diff --git a/src/claim/claim-with-api.c b/src/claim/claim-with-api.c index cb02f94582c743..4c5179fc59a1e8 100644 --- a/src/claim/claim-with-api.c +++ b/src/claim/claim-with-api.c @@ -361,7 +361,7 @@ static 
bool send_curl_request(const char *machine_guid, const char *hostname, co } bool claim_agent(const char *url, const char *token, const char *rooms, const char *proxy, bool insecure) { - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; spinlock_lock(&spinlock); if (!check_and_generate_certificates()) { @@ -411,7 +411,7 @@ bool claim_agent_from_environment(void) { bool claim_agent_from_claim_conf(void) { static struct config claim_config = APPCONFIG_INITIALIZER; - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; bool ret = false; spinlock_lock(&spinlock); diff --git a/src/claim/claim_id.c b/src/claim/claim_id.c index dd79eb640d5f2a..07f2d975480004 100644 --- a/src/claim/claim_id.c +++ b/src/claim/claim_id.c @@ -7,7 +7,7 @@ static struct { ND_UUID claim_uuid; ND_UUID claim_uuid_saved; } claim = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, }; void claim_id_clear_previous_working(void) { diff --git a/src/collectors/cgroups.plugin/cgroup-network.c b/src/collectors/cgroups.plugin/cgroup-network.c index d64b31288ee577..cc14dd9d643a36 100644 --- a/src/collectors/cgroups.plugin/cgroup-network.c +++ b/src/collectors/cgroups.plugin/cgroup-network.c @@ -554,7 +554,7 @@ static void read_from_spawned(SPAWN_INSTANCE *si, const char *name __maybe_unuse } fclose(fp); spawn_server_instance_read_fd_unset(si); - spawn_server_exec_kill(spawn_server, si); + spawn_server_exec_kill(spawn_server, si, 0); } void detect_veth_interfaces_spawn(pid_t pid) { diff --git a/src/collectors/ebpf.plugin/ebpf_apps.c b/src/collectors/ebpf.plugin/ebpf_apps.c index 548c14bd78c447..09a6c511457dfe 100644 --- a/src/collectors/ebpf.plugin/ebpf_apps.c +++ b/src/collectors/ebpf.plugin/ebpf_apps.c @@ -307,7 +307,7 @@ int ebpf_read_apps_groups_conf(struct ebpf_target **agdt, struct ebpf_target **a #define MAX_CMDLINE 16384 Pvoid_t ebpf_pid_judyL = NULL; 
-SPINLOCK ebpf_pid_spinlock = NETDATA_SPINLOCK_INITIALIZER; +SPINLOCK ebpf_pid_spinlock = SPINLOCK_INITIALIZER; void ebpf_pid_del(pid_t pid) { diff --git a/src/collectors/freeipmi.plugin/freeipmi_plugin.c b/src/collectors/freeipmi.plugin/freeipmi_plugin.c index a0eb0783d87c13..5ca58432071b73 100644 --- a/src/collectors/freeipmi.plugin/freeipmi_plugin.c +++ b/src/collectors/freeipmi.plugin/freeipmi_plugin.c @@ -1959,7 +1959,7 @@ int main (int argc, char **argv) { struct ipmi_collection_thread sensors_data = { .type = IPMI_COLLECT_TYPE_SENSORS, .freq_s = update_every, - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .debug = debug, .state = { .debug = debug, @@ -1974,7 +1974,7 @@ int main (int argc, char **argv) { }, sel_data = { .type = IPMI_COLLECT_TYPE_SEL, .freq_s = update_every_sel, - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .debug = debug, .state = { .debug = debug, diff --git a/src/collectors/proc.plugin/proc_net_dev_renames.c b/src/collectors/proc.plugin/proc_net_dev_renames.c index fb50ce66c6c7b3..e063ccef0055e9 100644 --- a/src/collectors/proc.plugin/proc_net_dev_renames.c +++ b/src/collectors/proc.plugin/proc_net_dev_renames.c @@ -15,7 +15,7 @@ static void dictionary_netdev_rename_delete_cb(const DICTIONARY_ITEM *item __may } void netdev_renames_init(void) { - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; spinlock_lock(&spinlock); if(!netdev_renames) { diff --git a/src/collectors/statsd.plugin/statsd.c b/src/collectors/statsd.plugin/statsd.c index 1d48f0c9127165..cf344fe96bfff8 100644 --- a/src/collectors/statsd.plugin/statsd.c +++ b/src/collectors/statsd.plugin/statsd.c @@ -2654,7 +2654,7 @@ void *statsd_main(void *ptr) { RRDSET *st_pcharts = NULL; RRDDIM *rd_pcharts = NULL; - if(telemetry_enabled) { + if(pulse_enabled) { st_metrics = rrdset_create_localhost( "netdata", "statsd_metrics", @@ -2851,7 +2851,7 @@ void 
*statsd_main(void *ptr) { if(unlikely(!service_running(SERVICE_COLLECTORS))) break; - if(telemetry_enabled) { + if(pulse_enabled) { rrddim_set_by_pointer(st_metrics, rd_metrics_gauge, (collected_number)statsd.gauges.metrics); rrddim_set_by_pointer(st_metrics, rd_metrics_counter, (collected_number)statsd.counters.metrics); rrddim_set_by_pointer(st_metrics, rd_metrics_timer, (collected_number)statsd.timers.metrics); diff --git a/src/collectors/systemd-journal.plugin/systemd-journal-files.c b/src/collectors/systemd-journal.plugin/systemd-journal-files.c index ea0511f7a42e2a..6bd0fca59cf77b 100644 --- a/src/collectors/systemd-journal.plugin/systemd-journal-files.c +++ b/src/collectors/systemd-journal.plugin/systemd-journal-files.c @@ -704,7 +704,7 @@ int filenames_compar(const void *a, const void *b) { } void journal_files_registry_update(void) { - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; if(spinlock_trylock(&spinlock)) { usec_t scan_monotonic_ut = now_monotonic_usec(); diff --git a/src/collectors/tc.plugin/plugin_tc.c b/src/collectors/tc.plugin/plugin_tc.c index 7102e216d55229..5dc058a2086873 100644 --- a/src/collectors/tc.plugin/plugin_tc.c +++ b/src/collectors/tc.plugin/plugin_tc.c @@ -1135,7 +1135,7 @@ void *tc_main(void *ptr) { } // fgets() failed or loop broke - int code = spawn_popen_kill(tc_child_instance); + int code = spawn_popen_kill(tc_child_instance, 0); tc_child_instance = NULL; if(unlikely(device)) { diff --git a/src/collectors/windows-events.plugin/windows-events-providers.c b/src/collectors/windows-events.plugin/windows-events-providers.c index d4c4d35ea15de2..fe2791840da885 100644 --- a/src/collectors/windows-events.plugin/windows-events-providers.c +++ b/src/collectors/windows-events.plugin/windows-events-providers.c @@ -75,7 +75,7 @@ static struct { ARAL *aral_providers; ARAL *aral_handles; } pbc = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, }; 
static void provider_load_list(PROVIDER_META_HANDLE *h, WEVT_VARIANT *content, WEVT_VARIANT *property, diff --git a/src/collectors/windows-events.plugin/windows-events-sources.c b/src/collectors/windows-events.plugin/windows-events-sources.c index b931ed059f451b..767ec44189b1dd 100644 --- a/src/collectors/windows-events.plugin/windows-events-sources.c +++ b/src/collectors/windows-events.plugin/windows-events-sources.c @@ -484,7 +484,7 @@ WEVT_SOURCE_TYPE categorize_channel(const wchar_t *channel_path, const char **pr } void wevt_sources_scan(void) { - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; LPWSTR channel = NULL; EVT_HANDLE hChannelEnum = NULL; diff --git a/src/daemon/buildinfo.c b/src/daemon/buildinfo.c index 82265ef9d7e9ff..4866c182bcbf84 100644 --- a/src/daemon/buildinfo.c +++ b/src/daemon/buildinfo.c @@ -1263,7 +1263,7 @@ __attribute__((constructor)) void initialize_build_info(void) { int get_system_info(struct rrdhost_system_info *system_info); static void populate_system_info(void) { static bool populated = false; - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; if(populated) return; diff --git a/src/daemon/common.h b/src/daemon/common.h index 0fde3f0384ce36..29cf8e40247cce 100644 --- a/src/daemon/common.h +++ b/src/daemon/common.h @@ -19,7 +19,7 @@ extern "C" { #include "daemon/config/netdata-conf.h" #include "daemon/dyncfg/dyncfg.h" -#include "daemon/telemetry/telemetry.h" +#include "daemon/pulse/pulse.h" // health monitoring and alarm notifications #include "health/health.h" diff --git a/src/daemon/config/netdata-conf-backwards-compatibility.c b/src/daemon/config/netdata-conf-backwards-compatibility.c index b1dbb35954bd70..ce7698f26c33c3 100644 --- a/src/daemon/config/netdata-conf-backwards-compatibility.c +++ b/src/daemon/config/netdata-conf-backwards-compatibility.c @@ -181,14 +181,26 @@ void 
netdata_conf_backwards_compatibility(void) { config_move(CONFIG_SECTION_GLOBAL, "enable zero metrics", CONFIG_SECTION_DB, "enable zero metrics"); - config_move("global statistics", "update every", - CONFIG_SECTION_TELEMETRY, "update every"); + // ---------------------------------------------------------------------------------------------------------------- + // global statistics -> telemetry -> pulse config_move(CONFIG_SECTION_PLUGINS, "netdata monitoring", - CONFIG_SECTION_PLUGINS, "netdata telemetry"); + CONFIG_SECTION_PLUGINS, "netdata pulse"); + + config_move(CONFIG_SECTION_PLUGINS, "netdata telemetry", + CONFIG_SECTION_PLUGINS, "netdata pulse"); config_move(CONFIG_SECTION_PLUGINS, "netdata monitoring extended", - CONFIG_SECTION_TELEMETRY, "extended telemetry"); + CONFIG_SECTION_PULSE, "extended"); + + config_move("telemetry", "extended telemetry", + CONFIG_SECTION_PULSE, "extended"); + + config_move("global statistics", "update every", + CONFIG_SECTION_PULSE, "update every"); + + config_move("telemetry", "update every", + CONFIG_SECTION_PULSE, "update every"); // ---------------------------------------------------------------------------------------------------------------- diff --git a/src/daemon/config/netdata-conf-db.c b/src/daemon/config/netdata-conf-db.c index d2b9c83b89d3ae..19f0229f97a948 100644 --- a/src/daemon/config/netdata-conf-db.c +++ b/src/daemon/config/netdata-conf-db.c @@ -145,8 +145,11 @@ void netdata_conf_dbengine_init(const char *hostname) { OS_SYSTEM_MEMORY sm = os_system_memory(true); if(sm.ram_total_bytes && sm.ram_available_bytes && sm.ram_total_bytes > sm.ram_available_bytes) { // calculate the default out of memory protection size + uint64_t keep_free = sm.ram_total_bytes / 10; + if(keep_free > 5ULL * 1024 * 1024 * 1024) + keep_free = 5ULL * 1024 * 1024 * 1024; char buf[64]; - size_snprintf(buf, sizeof(buf), sm.ram_total_bytes / 10, "B", false); + size_snprintf(buf, sizeof(buf), keep_free, "B", false); size_parse(buf, 
&dbengine_out_of_memory_protection, "B"); } diff --git a/src/daemon/main.c b/src/daemon/main.c index 65a99afabf2167..7b75cc9a11e706 100644 --- a/src/daemon/main.c +++ b/src/daemon/main.c @@ -1424,14 +1424,14 @@ int netdata_main(int argc, char **argv) { default_stacksize = 1 * 1024 * 1024; #ifdef NETDATA_INTERNAL_CHECKS - telemetry_enabled = true; - telemetry_extended_enabled = true; + pulse_enabled = true; + pulse_extended_enabled = true; #endif - telemetry_extended_enabled = - config_get_boolean(CONFIG_SECTION_TELEMETRY, "extended telemetry", telemetry_extended_enabled); + pulse_extended_enabled = + config_get_boolean(CONFIG_SECTION_PULSE, "extended", pulse_extended_enabled); - if(telemetry_extended_enabled) + if(pulse_extended_enabled) // this has to run before starting any other threads that use workers workers_utilization_enable(); diff --git a/src/daemon/telemetry/telemetry-aral.c b/src/daemon/pulse/pulse-aral.c similarity index 90% rename from src/daemon/telemetry/telemetry-aral.c rename to src/daemon/pulse/pulse-aral.c index b5b90d0bf99775..25514c1ce6e22f 100644 --- a/src/daemon/telemetry/telemetry-aral.c +++ b/src/daemon/pulse/pulse-aral.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-aral.h" +#define PULSE_INTERNALS 1 +#include "pulse-aral.h" struct aral_info { const char *name; @@ -19,7 +19,7 @@ static struct { ARAL_STATS_JudyLSet idx; } globals = { 0 }; -static void telemetry_aral_register_statistics(struct aral_statistics *stats, const char *name) { +static void pulse_aral_register_statistics(struct aral_statistics *stats, const char *name) { if(!name || !stats) return; @@ -33,7 +33,7 @@ static void telemetry_aral_register_statistics(struct aral_statistics *stats, co spinlock_unlock(&globals.spinlock); } -void telemetry_aral_register(ARAL *ar, const char *name) { +void pulse_aral_register(ARAL *ar, const char *name) { if(!ar) return; if(!name) @@ -41,10 +41,10 @@ void 
telemetry_aral_register(ARAL *ar, const char *name) { struct aral_statistics *stats = aral_get_statistics(ar); - telemetry_aral_register_statistics(stats, name); + pulse_aral_register_statistics(stats, name); } -void telemetry_aral_unregister(ARAL *ar) { +void pulse_aral_unregister(ARAL *ar) { if(!ar) return; struct aral_statistics *stats = aral_get_statistics(ar); @@ -58,11 +58,11 @@ void telemetry_aral_unregister(ARAL *ar) { spinlock_unlock(&globals.spinlock); } -void telemerty_aral_init(void) { - telemetry_aral_register_statistics(aral_by_size_statistics(), "by-size"); +void pulse_aral_init(void) { + pulse_aral_register_statistics(aral_by_size_statistics(), "by-size"); } -void telemetry_aral_do(bool extended) { +void pulse_aral_do(bool extended) { if(!extended) return; spinlock_lock(&globals.spinlock); @@ -111,7 +111,7 @@ void telemetry_aral_do(bool extended) { "Array Allocator Memory Utilization", "bytes", "netdata", - "telemetry", + "pulse", 910000, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -145,7 +145,7 @@ void telemetry_aral_do(bool extended) { "Array Allocator Memory Utilization", "%", "netdata", - "telemetry", + "pulse", 910001, localhost->rrd_update_every, RRDSET_TYPE_LINE); diff --git a/src/daemon/pulse/pulse-aral.h b/src/daemon/pulse/pulse-aral.h new file mode 100644 index 00000000000000..de254f65079828 --- /dev/null +++ b/src/daemon/pulse/pulse-aral.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_ARAL_H +#define NETDATA_PULSE_ARAL_H + +#include "daemon/common.h" + +void pulse_aral_register(ARAL *ar, const char *name); +void pulse_aral_unregister(ARAL *ar); + +#if defined(PULSE_INTERNALS) +void pulse_aral_init(void); +void pulse_aral_do(bool extended); +#endif + +#endif //NETDATA_PULSE_ARAL_H diff --git a/src/daemon/telemetry/telemetry-daemon-memory.c b/src/daemon/pulse/pulse-daemon-memory.c similarity index 96% rename from src/daemon/telemetry/telemetry-daemon-memory.c rename to 
src/daemon/pulse/pulse-daemon-memory.c index 19cfbc5e1b3d52..3c8ff3ecbd41b1 100644 --- a/src/daemon/telemetry/telemetry-daemon-memory.c +++ b/src/daemon/pulse/pulse-daemon-memory.c @@ -1,14 +1,15 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-daemon-memory.h" +#define PULSE_INTERNALS 1 +#include "pulse-daemon-memory.h" +#include "streaming/replication.h" #define dictionary_stats_memory_total(stats) \ ((stats).memory.dict + (stats).memory.values + (stats).memory.index) struct netdata_buffers_statistics netdata_buffers_statistics = {}; -void telemetry_daemon_memory_do(bool extended) { +void pulse_daemon_memory_do(bool extended) { { static RRDSET *st_memory = NULL; static RRDDIM *rd_database = NULL; @@ -44,7 +45,7 @@ void telemetry_daemon_memory_do(bool extended) { "Netdata Memory", "bytes", "netdata", - "stats", + "pulse", 130100, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -94,7 +95,7 @@ void telemetry_daemon_memory_do(bool extended) { string_statistics(NULL, NULL, NULL, NULL, NULL, &strings, NULL, NULL); rrddim_set_by_pointer(st_memory, rd_database, - (collected_number)telemetry_dbengine_total_memory + (collected_number)rrddim_db_memory_size); + (collected_number)pulse_dbengine_total_memory + (collected_number)rrddim_db_memory_size); #ifdef DICT_WITH_STATS rrddim_set_by_pointer(st_memory, rd_collectors, @@ -141,7 +142,7 @@ void telemetry_daemon_memory_do(bool extended) { (collected_number)dictionary_stats_memory_total(dictionary_stats_category_rrdlabels)); rrddim_set_by_pointer(st_memory, rd_ml, - (collected_number)telemetry_ml_get_current_memory_usage()); + (collected_number)pulse_ml_get_current_memory_usage()); rrddim_set_by_pointer(st_memory, rd_strings, (collected_number)strings); @@ -194,7 +195,7 @@ void telemetry_daemon_memory_do(bool extended) { "Netdata Memory Buffers", "bytes", "netdata", - "stats", + "pulse", 130101, localhost->rrd_update_every, RRDSET_TYPE_STACKED); diff --git 
a/src/daemon/telemetry/telemetry-daemon-memory.h b/src/daemon/pulse/pulse-daemon-memory.h similarity index 73% rename from src/daemon/telemetry/telemetry-daemon-memory.h rename to src/daemon/pulse/pulse-daemon-memory.h index f33b4d4a88219c..f2d3deb203122a 100644 --- a/src/daemon/telemetry/telemetry-daemon-memory.h +++ b/src/daemon/pulse/pulse-daemon-memory.h @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#ifndef NETDATA_TELEMETRY_DAEMON_MEMORY_H -#define NETDATA_TELEMETRY_DAEMON_MEMORY_H +#ifndef NETDATA_PULSE_DAEMON_MEMORY_H +#define NETDATA_PULSE_DAEMON_MEMORY_H #include "daemon/common.h" @@ -22,8 +22,8 @@ extern struct netdata_buffers_statistics { size_t buffers_web; } netdata_buffers_statistics; -#if defined(TELEMETRY_INTERNALS) -void telemetry_daemon_memory_do(bool extended); +#if defined(PULSE_INTERNALS) +void pulse_daemon_memory_do(bool extended); #endif -#endif //NETDATA_TELEMETRY_DAEMON_MEMORY_H +#endif //NETDATA_PULSE_DAEMON_MEMORY_H diff --git a/src/daemon/telemetry/telemetry-daemon.c b/src/daemon/pulse/pulse-daemon.c similarity index 83% rename from src/daemon/telemetry/telemetry-daemon.c rename to src/daemon/pulse/pulse-daemon.c index 7478520a222e51..6883d1bba28e3e 100644 --- a/src/daemon/telemetry/telemetry-daemon.c +++ b/src/daemon/pulse/pulse-daemon.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-daemon.h" +#define PULSE_INTERNALS 1 +#include "pulse-daemon.h" -static void telemetry_daemon_cpu_usage_do(bool extended __maybe_unused) { +static void pulse_daemon_cpu_usage_do(bool extended __maybe_unused) { struct rusage me; getrusage(RUSAGE_SELF, &me); @@ -22,7 +22,7 @@ static void telemetry_daemon_cpu_usage_do(bool extended __maybe_unused) { , "Netdata CPU usage" , "milliseconds/s" , "netdata" - , "stats" + , "pulse" , 130000 , localhost->rrd_update_every , RRDSET_TYPE_STACKED @@ -38,7 +38,7 @@ static void telemetry_daemon_cpu_usage_do(bool extended __maybe_unused) { } } 
-static void telemetry_daemon_uptime_do(bool extended __maybe_unused) { +static void pulse_daemon_uptime_do(bool extended __maybe_unused) { { static time_t netdata_boottime_time = 0; if (!netdata_boottime_time) @@ -59,7 +59,7 @@ static void telemetry_daemon_uptime_do(bool extended __maybe_unused) { "Netdata uptime", "seconds", "netdata", - "stats", + "pulse", 130150, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -72,8 +72,8 @@ static void telemetry_daemon_uptime_do(bool extended __maybe_unused) { } } -void telemetry_daemon_do(bool extended) { - telemetry_daemon_cpu_usage_do(extended); - telemetry_daemon_uptime_do(extended); - telemetry_daemon_memory_do(extended); +void pulse_daemon_do(bool extended) { + pulse_daemon_cpu_usage_do(extended); + pulse_daemon_uptime_do(extended); + pulse_daemon_memory_do(extended); } diff --git a/src/daemon/pulse/pulse-daemon.h b/src/daemon/pulse/pulse-daemon.h new file mode 100644 index 00000000000000..7cc8c1a46064eb --- /dev/null +++ b/src/daemon/pulse/pulse-daemon.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_DAEMON_H +#define NETDATA_PULSE_DAEMON_H + +#include "daemon/common.h" + +#if defined(PULSE_INTERNALS) +void pulse_daemon_do(bool extended); +#endif + +#endif //NETDATA_PULSE_DAEMON_H diff --git a/src/daemon/telemetry/telemetry-dbengine.c b/src/daemon/pulse/pulse-dbengine.c similarity index 98% rename from src/daemon/telemetry/telemetry-dbengine.c rename to src/daemon/pulse/pulse-dbengine.c index 74e72d2f11e64b..39baa453d1c7ab 100644 --- a/src/daemon/telemetry/telemetry-dbengine.c +++ b/src/daemon/pulse/pulse-dbengine.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-dbengine.h" +#define PULSE_INTERNALS 1 +#include "pulse-dbengine.h" -size_t telemetry_dbengine_total_memory = 0; +size_t pulse_dbengine_total_memory = 0; #if defined(ENABLE_DBENGINE) @@ -111,7 +111,7 @@ static void 
dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "%", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -159,7 +159,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "ops/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -213,7 +213,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "bytes", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -285,7 +285,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "pages", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_HEATMAP); @@ -331,7 +331,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "bytes", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -381,7 +381,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "pages", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -425,7 +425,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "bytes/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_AREA); @@ -467,7 +467,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "bytes/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_AREA); @@ -507,7 +507,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "events/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, 
RRDSET_TYPE_AREA); @@ -549,7 +549,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "events/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -609,7 +609,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p buffer_tostring(title), "workers", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -639,7 +639,7 @@ static void dbengine2_cache_statistics_charts(struct dbengine2_cache_pointers *p } -void telemetry_dbengine_do(bool extended) { +void pulse_dbengine_do(bool extended) { if(!main_cache || !main_mrg || !extended) return; @@ -676,7 +676,7 @@ void telemetry_dbengine_do(bool extended) { buffers_total_size += buffers.julyl; #endif - telemetry_dbengine_total_memory = pgc_main_stats.size + pgc_open_stats.size + pgc_extent_stats.size + mrg_stats.size + buffers_total_size; + pulse_dbengine_total_memory = pgc_main_stats.size + pgc_open_stats.size + pgc_extent_stats.size + mrg_stats.size + buffers_total_size; size_t priority = 135000; @@ -698,7 +698,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata DB Memory", "bytes", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -751,7 +751,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata DB Buffers", "bytes", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -812,7 +812,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata JulyL Memory Moved", "bytes/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_AREA); @@ -844,7 +844,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Metrics in Metrics Registry", "metrics", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -880,7 +880,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Metrics Registry 
Operations", "metrics", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -912,7 +912,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Metrics Registry References", "references", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -943,7 +943,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Queries Cache Hit Ratio", "%", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -1020,7 +1020,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Queries", "queries/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -1056,7 +1056,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Queries Running", "queries", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -1086,7 +1086,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Query Pages Metadata Source", "pages/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -1120,7 +1120,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Query Pages to Data Source", "pages/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -1156,7 +1156,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Query Next Page", "pages/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -1197,7 +1197,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Query Next Page Issues", "pages/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -1244,7 +1244,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Query Pages Loaded from Disk", "pages/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -1296,7 +1296,7 @@ void telemetry_dbengine_do(bool extended) { 
"Netdata Database Events", "events/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -1340,7 +1340,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Query Preparation Timings", "usec/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -1381,7 +1381,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata Query Timings", "usec/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -1438,7 +1438,7 @@ void telemetry_dbengine_do(bool extended) { rrd_rdunlock(); if (dbengine_contexts) { - /* deduplicate telemetry by getting the ones from the last context */ + /* deduplicate by getting the ones from the last context */ stats_array[30] = local_stats_array[30]; stats_array[31] = local_stats_array[31]; stats_array[32] = local_stats_array[32]; @@ -1461,7 +1461,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata DB engine data extents' compression savings ratio", "percentage", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -1502,7 +1502,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata DB engine I/O throughput", "MiB/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -1534,7 +1534,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata DB engine I/O operations", "operations/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -1567,7 +1567,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata DB engine errors", "errors/s", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); @@ -1602,7 +1602,7 @@ void telemetry_dbengine_do(bool extended) { "Netdata DB engine File Descriptors", "descriptors", "netdata", - "stats", + "pulse", priority, localhost->rrd_update_every, RRDSET_TYPE_LINE); diff --git a/src/daemon/pulse/pulse-dbengine.h 
b/src/daemon/pulse/pulse-dbengine.h new file mode 100644 index 00000000000000..cb7a7001abb651 --- /dev/null +++ b/src/daemon/pulse/pulse-dbengine.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_DBENGINE_H +#define NETDATA_PULSE_DBENGINE_H + +#include "daemon/common.h" + +#if defined(PULSE_INTERNALS) +extern size_t pulse_dbengine_total_memory; + +#if defined(ENABLE_DBENGINE) +void pulse_dbengine_do(bool extended); +#endif + +#endif + +#endif //NETDATA_PULSE_DBENGINE_H diff --git a/src/daemon/telemetry/telemetry-dictionary.c b/src/daemon/pulse/pulse-dictionary.c similarity index 98% rename from src/daemon/telemetry/telemetry-dictionary.c rename to src/daemon/pulse/pulse-dictionary.c index 75424cb0702865..4677ae7864a75f 100644 --- a/src/daemon/telemetry/telemetry-dictionary.c +++ b/src/daemon/pulse/pulse-dictionary.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-dictionary.h" +#define PULSE_INTERNALS 1 +#include "pulse-dictionary.h" struct dictionary_stats dictionary_stats_category_collectors = { .name = "collectors" }; struct dictionary_stats dictionary_stats_category_rrdhost = { .name = "rrdhost" }; @@ -104,7 +104,7 @@ static void update_dictionary_category_charts(struct dictionary_categories *c) { , "Dictionaries" , "dictionaries" , "netdata" - , "stats" + , "pulse" , priority + 0 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -145,7 +145,7 @@ static void update_dictionary_category_charts(struct dictionary_categories *c) { , "Dictionary Items" , "items" , "netdata" - , "stats" + , "pulse" , priority + 1 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -195,7 +195,7 @@ static void update_dictionary_category_charts(struct dictionary_categories *c) { , "Dictionary Operations" , "ops/s" , "netdata" - , "stats" + , "pulse" , priority + 2 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -254,7 +254,7 @@ static void 
update_dictionary_category_charts(struct dictionary_categories *c) { , "Dictionary Callbacks" , "callbacks/s" , "netdata" - , "stats" + , "pulse" , priority + 3 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -300,7 +300,7 @@ static void update_dictionary_category_charts(struct dictionary_categories *c) { , "Dictionary Memory" , "bytes" , "netdata" - , "stats" + , "pulse" , priority + 4 , localhost->rrd_update_every , RRDSET_TYPE_STACKED @@ -345,7 +345,7 @@ static void update_dictionary_category_charts(struct dictionary_categories *c) { , "Dictionary Spins" , "count" , "netdata" - , "stats" + , "pulse" , priority + 5 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -368,7 +368,7 @@ static void update_dictionary_category_charts(struct dictionary_categories *c) { } } -void telemetry_dictionary_do(bool extended) { +void pulse_dictionary_do(bool extended) { if(!extended) return; for(int i = 0; dictionary_categories[i].stats ;i++) { diff --git a/src/daemon/telemetry/telemetry-dictionary.h b/src/daemon/pulse/pulse-dictionary.h similarity index 78% rename from src/daemon/telemetry/telemetry-dictionary.h rename to src/daemon/pulse/pulse-dictionary.h index 2e9ab8201bdba4..2b97eecc147012 100644 --- a/src/daemon/telemetry/telemetry-dictionary.h +++ b/src/daemon/pulse/pulse-dictionary.h @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#ifndef NETDATA_TELEMETRY_DICTIONARY_H -#define NETDATA_TELEMETRY_DICTIONARY_H +#ifndef NETDATA_PULSE_DICTIONARY_H +#define NETDATA_PULSE_DICTIONARY_H #include "daemon/common.h" @@ -17,8 +17,8 @@ extern struct dictionary_stats dictionary_stats_category_replication; extern size_t rrddim_db_memory_size; -#if defined(TELEMETRY_INTERNALS) -void telemetry_dictionary_do(bool extended); +#if defined(PULSE_INTERNALS) +void pulse_dictionary_do(bool extended); #endif -#endif //NETDATA_TELEMETRY_DICTIONARY_H +#endif //NETDATA_PULSE_DICTIONARY_H diff --git a/src/daemon/telemetry/telemetry-gorilla.c b/src/daemon/pulse/pulse-gorilla.c 
similarity index 93% rename from src/daemon/telemetry/telemetry-gorilla.c rename to src/daemon/pulse/pulse-gorilla.c index 441dcffe504faf..3d771e0c20df8c 100644 --- a/src/daemon/telemetry/telemetry-gorilla.c +++ b/src/daemon/pulse/pulse-gorilla.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-gorilla.h" +#define PULSE_INTERNALS 1 +#include "pulse-gorilla.h" static struct gorilla_statistics { bool enabled; @@ -13,13 +13,13 @@ static struct gorilla_statistics { alignas(64) uint64_t gorilla_tier0_disk_original_bytes; } gorilla_statistics = { 0 }; -void telemetry_gorilla_hot_buffer_added() { +void pulse_gorilla_hot_buffer_added() { if(!gorilla_statistics.enabled) return; __atomic_fetch_add(&gorilla_statistics.tier0_hot_gorilla_buffers, 1, __ATOMIC_RELAXED); } -void telemetry_gorilla_tier0_page_flush(uint32_t actual, uint32_t optimal, uint32_t original) { +void pulse_gorilla_tier0_page_flush(uint32_t actual, uint32_t optimal, uint32_t original) { if(!gorilla_statistics.enabled) return; __atomic_fetch_add(&gorilla_statistics.gorilla_tier0_disk_actual_bytes, actual, __ATOMIC_RELAXED); @@ -34,7 +34,7 @@ static inline void global_statistics_copy(struct gorilla_statistics *gs) { gs->gorilla_tier0_disk_original_bytes = __atomic_load_n(&gorilla_statistics.gorilla_tier0_disk_original_bytes, __ATOMIC_RELAXED); } -void telemetry_gorilla_do(bool extended __maybe_unused) { +void pulse_gorilla_do(bool extended __maybe_unused) { #ifdef ENABLE_DBENGINE if(!extended) return; gorilla_statistics.enabled = true; @@ -57,7 +57,7 @@ void telemetry_gorilla_do(bool extended __maybe_unused) { , "Number of gorilla_pages" , "count" , "netdata" - , "stats" + , "pulse" , 131004 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -89,7 +89,7 @@ void telemetry_gorilla_do(bool extended __maybe_unused) { , "DBENGINE Gorilla Compression Efficiency on Tier 0" , "bytes" , "netdata" - , "stats" + , "pulse" , 131005 , 
localhost->rrd_update_every , RRDSET_TYPE_LINE diff --git a/src/daemon/pulse/pulse-gorilla.h b/src/daemon/pulse/pulse-gorilla.h new file mode 100644 index 00000000000000..ef9975b42172d6 --- /dev/null +++ b/src/daemon/pulse/pulse-gorilla.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_GORILLA_H +#define NETDATA_PULSE_GORILLA_H + +#include "daemon/common.h" + +void pulse_gorilla_hot_buffer_added(); +void pulse_gorilla_tier0_page_flush(uint32_t actual, uint32_t optimal, uint32_t original); + +#if defined(PULSE_INTERNALS) +void pulse_gorilla_do(bool extended); +#endif + +#endif //NETDATA_PULSE_GORILLA_H diff --git a/src/daemon/telemetry/telemetry-heartbeat.c b/src/daemon/pulse/pulse-heartbeat.c similarity index 91% rename from src/daemon/telemetry/telemetry-heartbeat.c rename to src/daemon/pulse/pulse-heartbeat.c index c66c9a04065ab9..c13c9ce1c7f68d 100644 --- a/src/daemon/telemetry/telemetry-heartbeat.c +++ b/src/daemon/pulse/pulse-heartbeat.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-heartbeat.h" +#define PULSE_INTERNALS 1 +#include "pulse-heartbeat.h" -void telemetry_heartbeat_do(bool extended) { +void pulse_heartbeat_do(bool extended) { if(!extended) return; static RRDSET *st_heartbeat = NULL; @@ -21,7 +21,7 @@ void telemetry_heartbeat_do(bool extended) { , "System clock jitter" , "microseconds" , "netdata" - , "stats" + , "pulse" , 900000 , localhost->rrd_update_every , RRDSET_TYPE_AREA); diff --git a/src/daemon/pulse/pulse-heartbeat.h b/src/daemon/pulse/pulse-heartbeat.h new file mode 100644 index 00000000000000..ea75a54218f76c --- /dev/null +++ b/src/daemon/pulse/pulse-heartbeat.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_HEARTBEAT_H +#define NETDATA_PULSE_HEARTBEAT_H + +#include "daemon/common.h" + +#if defined(PULSE_INTERNALS) +void pulse_heartbeat_do(bool extended); +#endif + +#endif 
//NETDATA_PULSE_HEARTBEAT_H diff --git a/src/daemon/telemetry/telemetry-http-api.c b/src/daemon/pulse/pulse-http-api.c similarity index 94% rename from src/daemon/telemetry/telemetry-http-api.c rename to src/daemon/pulse/pulse-http-api.c index 4211050df4c6a7..a4f035cb293e8d 100644 --- a/src/daemon/telemetry/telemetry-http-api.c +++ b/src/daemon/pulse/pulse-http-api.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-http-api.h" +#define PULSE_INTERNALS 1 +#include "pulse-http-api.h" #define GLOBAL_STATS_RESET_WEB_USEC_MAX 0x01 @@ -21,16 +21,16 @@ static struct web_statistics { uint64_t content_size_compressed; } web_statistics; -uint64_t telemetry_web_client_connected(void) { +uint64_t pulse_web_client_connected(void) { __atomic_fetch_add(&web_statistics.connected_clients, 1, __ATOMIC_RELAXED); return __atomic_fetch_add(&web_statistics.web_client_count, 1, __ATOMIC_RELAXED); } -void telemetry_web_client_disconnected(void) { +void pulse_web_client_disconnected(void) { __atomic_fetch_sub(&web_statistics.connected_clients, 1, __ATOMIC_RELAXED); } -void telemetry_web_request_completed(uint64_t dt, +void pulse_web_request_completed(uint64_t dt, uint64_t bytes_received, uint64_t bytes_sent, uint64_t content_size, @@ -47,7 +47,7 @@ void telemetry_web_request_completed(uint64_t dt, __atomic_fetch_add(&web_statistics.content_size_compressed, compressed_content_size, __ATOMIC_RELAXED); } -static inline void telemetry_web_copy(struct web_statistics *gs, uint8_t options) { +static inline void pulse_web_copy(struct web_statistics *gs, uint8_t options) { gs->connected_clients = __atomic_load_n(&web_statistics.connected_clients, __ATOMIC_RELAXED); gs->web_requests = __atomic_load_n(&web_statistics.web_requests, __ATOMIC_RELAXED); gs->web_usec = __atomic_load_n(&web_statistics.web_usec, __ATOMIC_RELAXED); @@ -64,9 +64,9 @@ static inline void telemetry_web_copy(struct web_statistics *gs, uint8_t options } } -void 
telemetry_web_do(bool extended) { +void pulse_web_do(bool extended) { static struct web_statistics gs; - telemetry_web_copy(&gs, GLOBAL_STATS_RESET_WEB_USEC_MAX); + pulse_web_copy(&gs, GLOBAL_STATS_RESET_WEB_USEC_MAX); // ---------------------------------------------------------------- @@ -84,7 +84,7 @@ void telemetry_web_do(bool extended) { , "Netdata Web API Clients" , "connected clients" , "netdata" - , "stats" + , "pulse" , 130200 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -113,7 +113,7 @@ void telemetry_web_do(bool extended) { , "Netdata Web API Requests Received" , "requests/s" , "netdata" - , "stats" + , "pulse" , 130300 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -143,7 +143,7 @@ void telemetry_web_do(bool extended) { , "Netdata Web API Network Traffic" , "kilobits/s" , "netdata" - , "stats" + , "pulse" , 130400 , localhost->rrd_update_every , RRDSET_TYPE_AREA @@ -178,7 +178,7 @@ void telemetry_web_do(bool extended) { , "Netdata Web API Response Time" , "milliseconds/request" , "netdata" - , "stats" + , "pulse" , 130500 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -232,7 +232,7 @@ void telemetry_web_do(bool extended) { , "Netdata Web API Responses Compression Savings Ratio" , "percentage" , "netdata" - , "stats" + , "pulse" , 130600 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -241,7 +241,7 @@ void telemetry_web_do(bool extended) { rd_savings = rrddim_add(st_compression, "savings", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); } - // since we don't lock here to read the telemetry + // since we don't lock here to read the data // read the smaller value first unsigned long long gcompressed_content_size = gs.content_size_compressed; unsigned long long gcontent_size = gs.content_size_uncompressed; diff --git a/src/daemon/telemetry/telemetry-http-api.h b/src/daemon/pulse/pulse-http-api.h similarity index 50% rename from src/daemon/telemetry/telemetry-http-api.h rename to src/daemon/pulse/pulse-http-api.h index 
2b1ad38d1259a9..b5569acf0462e8 100644 --- a/src/daemon/telemetry/telemetry-http-api.h +++ b/src/daemon/pulse/pulse-http-api.h @@ -1,21 +1,21 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#ifndef NETDATA_TELEMETRY_HTTP_API_H -#define NETDATA_TELEMETRY_HTTP_API_H +#ifndef NETDATA_PULSE_HTTP_API_H +#define NETDATA_PULSE_HTTP_API_H #include "daemon/common.h" -uint64_t telemetry_web_client_connected(void); -void telemetry_web_client_disconnected(void); +uint64_t pulse_web_client_connected(void); +void pulse_web_client_disconnected(void); -void telemetry_web_request_completed(uint64_t dt, +void pulse_web_request_completed(uint64_t dt, uint64_t bytes_received, uint64_t bytes_sent, uint64_t content_size, uint64_t compressed_content_size); -#if defined(TELEMETRY_INTERNALS) -void telemetry_web_do(bool extended); +#if defined(PULSE_INTERNALS) +void pulse_web_do(bool extended); #endif -#endif //NETDATA_TELEMETRY_HTTP_API_H +#endif //NETDATA_PULSE_HTTP_API_H diff --git a/src/daemon/telemetry/telemetry-ingestion.c b/src/daemon/pulse/pulse-ingestion.c similarity index 83% rename from src/daemon/telemetry/telemetry-ingestion.c rename to src/daemon/pulse/pulse-ingestion.c index d1b2d03ec351b1..d31e28d0151da1 100644 --- a/src/daemon/telemetry/telemetry-ingestion.c +++ b/src/daemon/pulse/pulse-ingestion.c @@ -1,27 +1,27 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-ingestion.h" +#define PULSE_INTERNALS 1 +#include "pulse-ingestion.h" static struct ingest_statistics { uint64_t db_points_stored_per_tier[RRD_STORAGE_TIERS]; } ingest_statistics; -void telemetry_queries_rrdset_collection_completed(size_t *points_read_per_tier_array) { +void pulse_queries_rrdset_collection_completed(size_t *points_read_per_tier_array) { for(size_t tier = 0; tier < storage_tiers ;tier++) { __atomic_fetch_add(&ingest_statistics.db_points_stored_per_tier[tier], points_read_per_tier_array[tier], __ATOMIC_RELAXED); points_read_per_tier_array[tier] = 0; } 
} -static inline void telemetry_ingestion_copy(struct ingest_statistics *gs) { +static inline void pulse_ingestion_copy(struct ingest_statistics *gs) { for(size_t tier = 0; tier < storage_tiers ;tier++) gs->db_points_stored_per_tier[tier] = __atomic_load_n(&ingest_statistics.db_points_stored_per_tier[tier], __ATOMIC_RELAXED); } -void telemetry_ingestion_do(bool extended __maybe_unused) { +void pulse_ingestion_do(bool extended __maybe_unused) { static struct ingest_statistics gs; - telemetry_ingestion_copy(&gs); + pulse_ingestion_copy(&gs); { static RRDSET *st_points_stored = NULL; @@ -37,7 +37,7 @@ void telemetry_ingestion_do(bool extended __maybe_unused) { , "Netdata Time-Series Collected Samples" , "samples/s" , "netdata" - , "stats" + , "pulse" , 131003 , localhost->rrd_update_every , RRDSET_TYPE_STACKED diff --git a/src/daemon/pulse/pulse-ingestion.h b/src/daemon/pulse/pulse-ingestion.h new file mode 100644 index 00000000000000..30d522b0672dad --- /dev/null +++ b/src/daemon/pulse/pulse-ingestion.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_INGESTION_H +#define NETDATA_PULSE_INGESTION_H + +#include "daemon/common.h" + +void pulse_queries_rrdset_collection_completed(size_t *points_read_per_tier_array); + +#if defined(PULSE_INTERNALS) +void pulse_ingestion_do(bool extended); +#endif + +#endif //NETDATA_PULSE_INGESTION_H diff --git a/src/daemon/telemetry/telemetry-ml.c b/src/daemon/pulse/pulse-ml.c similarity index 87% rename from src/daemon/telemetry/telemetry-ml.c rename to src/daemon/pulse/pulse-ml.c index e127850a90fd3c..7995dcfce47b05 100644 --- a/src/daemon/telemetry/telemetry-ml.c +++ b/src/daemon/pulse/pulse-ml.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-ml.h" +#define PULSE_INTERNALS 1 +#include "pulse-ml.h" static struct ml_statistics { alignas(64) uint64_t ml_models_consulted; @@ -14,17 +14,17 @@ static struct ml_statistics { alignas(64) 
uint64_t ml_memory_delete; } ml_statistics = {0}; -void telemetry_ml_models_received() +void pulse_ml_models_received() { __atomic_fetch_add(&ml_statistics.ml_models_received, 1, __ATOMIC_RELAXED); } -void telemetry_ml_models_ignored() +void pulse_ml_models_ignored() { __atomic_fetch_add(&ml_statistics.ml_models_ignored, 1, __ATOMIC_RELAXED); } -void telemetry_ml_models_sent() +void pulse_ml_models_sent() { __atomic_fetch_add(&ml_statistics.ml_models_sent, 1, __ATOMIC_RELAXED); } @@ -34,24 +34,24 @@ void global_statistics_ml_models_deserialization_failures() __atomic_fetch_add(&ml_statistics.ml_models_deserialization_failures, 1, __ATOMIC_RELAXED); } -void telemetry_ml_models_consulted(size_t models_consulted) +void pulse_ml_models_consulted(size_t models_consulted) { __atomic_fetch_add(&ml_statistics.ml_models_consulted, models_consulted, __ATOMIC_RELAXED); } -void telemetry_ml_memory_allocated(size_t n) +void pulse_ml_memory_allocated(size_t n) { __atomic_fetch_add(&ml_statistics.ml_memory_consumption, n, __ATOMIC_RELAXED); __atomic_fetch_add(&ml_statistics.ml_memory_new, 1, __ATOMIC_RELAXED); } -void telemetry_ml_memory_freed(size_t n) +void pulse_ml_memory_freed(size_t n) { __atomic_fetch_sub(&ml_statistics.ml_memory_consumption, n, __ATOMIC_RELAXED); __atomic_fetch_add(&ml_statistics.ml_memory_delete, 1, __ATOMIC_RELAXED); } -uint64_t telemetry_ml_get_current_memory_usage(void) { +uint64_t pulse_ml_get_current_memory_usage(void) { return __atomic_load_n(&ml_statistics.ml_memory_consumption, __ATOMIC_RELAXED); } @@ -69,7 +69,7 @@ static inline void ml_statistics_copy(struct ml_statistics *gs) gs->ml_memory_delete = __atomic_load_n(&ml_statistics.ml_memory_delete, __ATOMIC_RELAXED); } -void telemetry_ml_do(bool extended) +void pulse_ml_do(bool extended) { if (!extended) return; diff --git a/src/daemon/pulse/pulse-ml.h b/src/daemon/pulse/pulse-ml.h new file mode 100644 index 00000000000000..545cfafa65584b --- /dev/null +++ b/src/daemon/pulse/pulse-ml.h @@ -0,0 
+1,33 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_ML_H +#define NETDATA_PULSE_ML_H + +#include "daemon/common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void pulse_ml_models_consulted(size_t models_consulted); +void pulse_ml_models_received(); +void pulse_ml_models_ignored(); +void pulse_ml_models_sent(); + +void pulse_ml_memory_allocated(size_t n); +void pulse_ml_memory_freed(size_t n); + +void global_statistics_ml_models_deserialization_failures(); + +uint64_t pulse_ml_get_current_memory_usage(void); + +#if defined(PULSE_INTERNALS) +void pulse_ml_do(bool extended); +#endif + +#ifdef __cplusplus +} +#endif + + +#endif //NETDATA_PULSE_ML_H diff --git a/src/daemon/telemetry/telemetry-queries.c b/src/daemon/pulse/pulse-queries.c similarity index 95% rename from src/daemon/telemetry/telemetry-queries.c rename to src/daemon/pulse/pulse-queries.c index c4ec819c65854f..6d55405032805c 100644 --- a/src/daemon/telemetry/telemetry-queries.c +++ b/src/daemon/pulse/pulse-queries.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-queries.h" +#define PULSE_INTERNALS 1 +#include "pulse-queries.h" +#include "streaming/replication.h" static struct query_statistics { uint64_t api_data_queries_made; @@ -31,22 +32,22 @@ static struct query_statistics { uint64_t exporters_db_points_read; } query_statistics; -void telemetry_queries_ml_query_completed(size_t points_read) { +void pulse_queries_ml_query_completed(size_t points_read) { __atomic_fetch_add(&query_statistics.ml_queries_made, 1, __ATOMIC_RELAXED); __atomic_fetch_add(&query_statistics.ml_db_points_read, points_read, __ATOMIC_RELAXED); } -void telemetry_queries_exporters_query_completed(size_t points_read) { +void pulse_queries_exporters_query_completed(size_t points_read) { __atomic_fetch_add(&query_statistics.exporters_queries_made, 1, __ATOMIC_RELAXED); __atomic_fetch_add(&query_statistics.exporters_db_points_read, points_read, 
__ATOMIC_RELAXED); } -void telemetry_queries_backfill_query_completed(size_t points_read) { +void pulse_queries_backfill_query_completed(size_t points_read) { __atomic_fetch_add(&query_statistics.backfill_queries_made, 1, __ATOMIC_RELAXED); __atomic_fetch_add(&query_statistics.backfill_db_points_read, points_read, __ATOMIC_RELAXED); } -void telemetry_queries_rrdr_query_completed(size_t queries, uint64_t db_points_read, uint64_t result_points_generated, QUERY_SOURCE query_source) { +void pulse_queries_rrdr_query_completed(size_t queries, uint64_t db_points_read, uint64_t result_points_generated, QUERY_SOURCE query_source) { switch(query_source) { case QUERY_SOURCE_API_DATA: __atomic_fetch_add(&query_statistics.api_data_queries_made, queries, __ATOMIC_RELAXED); @@ -85,7 +86,7 @@ void telemetry_queries_rrdr_query_completed(size_t queries, uint64_t db_points_r } } -static inline void telemetry_queries_copy(struct query_statistics *gs) { +static inline void pulse_queries_copy(struct query_statistics *gs) { gs->api_data_queries_made = __atomic_load_n(&query_statistics.api_data_queries_made, __ATOMIC_RELAXED); gs->api_data_db_points_read = __atomic_load_n(&query_statistics.api_data_db_points_read, __ATOMIC_RELAXED); gs->api_data_result_points_generated = __atomic_load_n(&query_statistics.api_data_result_points_generated, __ATOMIC_RELAXED); @@ -112,9 +113,9 @@ static inline void telemetry_queries_copy(struct query_statistics *gs) { gs->backfill_db_points_read = __atomic_load_n(&query_statistics.backfill_db_points_read, __ATOMIC_RELAXED); } -void telemetry_queries_do(bool extended __maybe_unused) { +void pulse_queries_do(bool extended __maybe_unused) { static struct query_statistics gs; - telemetry_queries_copy(&gs); + pulse_queries_copy(&gs); struct replication_query_statistics replication = replication_get_query_statistics(); @@ -139,7 +140,7 @@ void telemetry_queries_do(bool extended __maybe_unused) { , "Netdata Time-Series DB Queries" , "queries/s" , "netdata" - , 
"stats" + , "pulse" , 131000 , localhost->rrd_update_every , RRDSET_TYPE_STACKED @@ -188,7 +189,7 @@ void telemetry_queries_do(bool extended __maybe_unused) { , "Netdata Time-Series DB Samples Read" , "points/s" , "netdata" - , "stats" + , "pulse" , 131001 , localhost->rrd_update_every , RRDSET_TYPE_STACKED @@ -235,7 +236,7 @@ void telemetry_queries_do(bool extended __maybe_unused) { , "Netdata Time-Series Samples Generated" , "points/s" , "netdata" - , "stats" + , "pulse" , 131002 , localhost->rrd_update_every , RRDSET_TYPE_STACKED diff --git a/src/daemon/pulse/pulse-queries.h b/src/daemon/pulse/pulse-queries.h new file mode 100644 index 00000000000000..59e28faffc4276 --- /dev/null +++ b/src/daemon/pulse/pulse-queries.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_QUERIES_H +#define NETDATA_PULSE_QUERIES_H + +#include "daemon/common.h" + +void pulse_queries_ml_query_completed(size_t points_read); +void pulse_queries_exporters_query_completed(size_t points_read); +void pulse_queries_backfill_query_completed(size_t points_read); +void pulse_queries_rrdr_query_completed(size_t queries, uint64_t db_points_read, uint64_t result_points_generated, QUERY_SOURCE query_source); + +#if defined(PULSE_INTERNALS) +void pulse_queries_do(bool extended); +#endif + +#endif //NETDATA_PULSE_QUERIES_H diff --git a/src/daemon/telemetry/telemetry-sqlite3.c b/src/daemon/pulse/pulse-sqlite3.c similarity index 97% rename from src/daemon/telemetry/telemetry-sqlite3.c rename to src/daemon/pulse/pulse-sqlite3.c index da160067e07cd2..b2ab6445acb9fa 100644 --- a/src/daemon/telemetry/telemetry-sqlite3.c +++ b/src/daemon/pulse/pulse-sqlite3.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-sqlite3.h" +#define PULSE_INTERNALS 1 +#include "pulse-sqlite3.h" static struct sqlite3_statistics { bool enabled; @@ -22,7 +22,7 @@ static struct sqlite3_statistics { alignas(64) uint64_t 
sqlite3_context_cache_write; } sqlite3_statistics = { }; -void telemetry_sqlite3_query_completed(bool success, bool busy, bool locked) { +void pulse_sqlite3_query_completed(bool success, bool busy, bool locked) { if(!sqlite3_statistics.enabled) return; __atomic_fetch_add(&sqlite3_statistics.sqlite3_queries_made, 1, __ATOMIC_RELAXED); @@ -41,7 +41,7 @@ void telemetry_sqlite3_query_completed(bool success, bool busy, bool locked) { } } -void telemetry_sqlite3_row_completed(void) { +void pulse_sqlite3_row_completed(void) { if(!sqlite3_statistics.enabled) return; __atomic_fetch_add(&sqlite3_statistics.sqlite3_rows, 1, __ATOMIC_RELAXED); @@ -123,7 +123,7 @@ static inline void sqlite3_statistics_copy(struct sqlite3_statistics *gs) { last_run = now_monotonic_usec(); } -void telemetry_sqlite3_do(bool extended) { +void pulse_sqlite3_do(bool extended) { if(!extended) return; sqlite3_statistics.enabled = true; @@ -144,7 +144,7 @@ void telemetry_sqlite3_do(bool extended) { , "Netdata SQLite3 Queries" , "queries/s" , "netdata" - , "stats" + , "pulse" , 131100 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -174,7 +174,7 @@ void telemetry_sqlite3_do(bool extended) { , "Netdata SQLite3 Queries by status" , "queries/s" , "netdata" - , "stats" + , "pulse" , 131101 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -210,7 +210,7 @@ void telemetry_sqlite3_do(bool extended) { , "Netdata SQLite3 Rows" , "rows/s" , "netdata" - , "stats" + , "pulse" , 131102 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -241,7 +241,7 @@ void telemetry_sqlite3_do(bool extended) { , "Netdata SQLite3 metadata cache" , "ops/s" , "netdata" - , "stats" + , "pulse" , 131103 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -285,7 +285,7 @@ void telemetry_sqlite3_do(bool extended) { , "Netdata SQLite3 context cache" , "ops/s" , "netdata" - , "stats" + , "pulse" , 131104 , localhost->rrd_update_every , RRDSET_TYPE_LINE diff --git a/src/daemon/pulse/pulse-sqlite3.h b/src/daemon/pulse/pulse-sqlite3.h 
new file mode 100644 index 00000000000000..865a1f67a45dfb --- /dev/null +++ b/src/daemon/pulse/pulse-sqlite3.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_SQLITE3_H +#define NETDATA_PULSE_SQLITE3_H + +#include "daemon/common.h" + +void pulse_sqlite3_query_completed(bool success, bool busy, bool locked); +void pulse_sqlite3_row_completed(void); + +#if defined(PULSE_INTERNALS) +void pulse_sqlite3_do(bool extended); +#endif + +#endif //NETDATA_PULSE_SQLITE3_H diff --git a/src/daemon/telemetry/telemetry-string.c b/src/daemon/pulse/pulse-string.c similarity index 95% rename from src/daemon/telemetry/telemetry-string.c rename to src/daemon/pulse/pulse-string.c index 33922ae9a30fec..ca35f020eb234b 100644 --- a/src/daemon/telemetry/telemetry-string.c +++ b/src/daemon/pulse/pulse-string.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-string.h" +#define PULSE_INTERNALS 1 +#include "pulse-string.h" -void telemetry_string_do(bool extended) { +void pulse_string_do(bool extended) { if(!extended) return; static RRDSET *st_ops = NULL, *st_entries = NULL, *st_mem = NULL; @@ -28,7 +28,7 @@ void telemetry_string_do(bool extended) { , "Strings operations" , "ops/s" , "netdata" - , "stats" + , "pulse" , 910000 , localhost->rrd_update_every , RRDSET_TYPE_LINE); @@ -61,7 +61,7 @@ void telemetry_string_do(bool extended) { , "Strings entries" , "entries" , "netdata" - , "stats" + , "pulse" , 910001 , localhost->rrd_update_every , RRDSET_TYPE_AREA); @@ -88,7 +88,7 @@ void telemetry_string_do(bool extended) { , "Strings memory" , "bytes" , "netdata" - , "stats" + , "pulse" , 910001 , localhost->rrd_update_every , RRDSET_TYPE_AREA); diff --git a/src/daemon/pulse/pulse-string.h b/src/daemon/pulse/pulse-string.h new file mode 100644 index 00000000000000..86291086280ea1 --- /dev/null +++ b/src/daemon/pulse/pulse-string.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + 
+#ifndef NETDATA_PULSE_STRING_H +#define NETDATA_PULSE_STRING_H + +#include "daemon/common.h" + +#if defined(PULSE_INTERNALS) +void pulse_string_do(bool extended); +#endif + +#endif //NETDATA_PULSE_STRING_H diff --git a/src/daemon/telemetry/telemetry-trace-allocations.c b/src/daemon/pulse/pulse-trace-allocations.c similarity index 95% rename from src/daemon/telemetry/telemetry-trace-allocations.c rename to src/daemon/pulse/pulse-trace-allocations.c index a0ee095aebff4c..d910f479bbeb33 100644 --- a/src/daemon/telemetry/telemetry-trace-allocations.c +++ b/src/daemon/pulse/pulse-trace-allocations.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-trace-allocations.h" +#define PULSE_INTERNALS 1 +#include "pulse-trace-allocations.h" #ifdef NETDATA_TRACE_ALLOCATIONS @@ -58,7 +58,7 @@ static int do_memory_trace_item(void *item, void *data) { return 1; } -void telemetry_trace_allocations_do(bool extended) { +void pulse_trace_allocations_do(bool extended) { if(!extended) return; static struct memory_trace_data tmp = { @@ -78,7 +78,7 @@ void telemetry_trace_allocations_do(bool extended) { , "Netdata Memory Used by Function" , "bytes" , "netdata" - , "stats" + , "pulse" , 900000 , localhost->rrd_update_every , RRDSET_TYPE_STACKED @@ -95,7 +95,7 @@ void telemetry_trace_allocations_do(bool extended) { , "Netdata Memory Operations by Function" , "ops/s" , "netdata" - , "stats" + , "pulse" , 900001 , localhost->rrd_update_every , RRDSET_TYPE_LINE @@ -112,7 +112,7 @@ void telemetry_trace_allocations_do(bool extended) { , "Netdata Memory Allocations by Function" , "allocations" , "netdata" - , "stats" + , "pulse" , 900002 , localhost->rrd_update_every , RRDSET_TYPE_STACKED @@ -129,7 +129,7 @@ void telemetry_trace_allocations_do(bool extended) { , "Netdata Average Allocation Size by Function" , "bytes" , "netdata" - , "stats" + , "pulse" , 900003 , localhost->rrd_update_every , RRDSET_TYPE_LINE diff --git 
a/src/daemon/pulse/pulse-trace-allocations.h b/src/daemon/pulse/pulse-trace-allocations.h new file mode 100644 index 00000000000000..e4dd9f4cf4b8d6 --- /dev/null +++ b/src/daemon/pulse/pulse-trace-allocations.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_TRACE_ALLOCATIONS_H +#define NETDATA_PULSE_TRACE_ALLOCATIONS_H + +#include "daemon/common.h" + +#if defined(PULSE_INTERNALS) +#ifdef NETDATA_TRACE_ALLOCATIONS +void pulse_trace_allocations_do(bool extended); +#endif +#endif + +#endif //NETDATA_PULSE_TRACE_ALLOCATIONS_H diff --git a/src/daemon/telemetry/telemetry-workers.c b/src/daemon/pulse/pulse-workers.c similarity index 77% rename from src/daemon/telemetry/telemetry-workers.c rename to src/daemon/pulse/pulse-workers.c index fd01ab1225f89f..e2e00c80e2bb8e 100644 --- a/src/daemon/telemetry/telemetry-workers.c +++ b/src/daemon/pulse/pulse-workers.c @@ -1,10 +1,21 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 -#include "telemetry-workers.h" +#define PULSE_INTERNALS 1 +#include "pulse-workers.h" #define WORKERS_MIN_PERCENT_DEFAULT 10000.0 +struct worker_spinlocks { + size_t locks; + size_t spins; + + RRDDIM *rd_locks; + RRDDIM *rd_spins; +}; + +DEFINE_JUDYL_TYPED(SPINLOCKS, struct worker_spinlocks *); +SPINLOCKS_JudyLSet ALL_SPINLOCKS = { 0 }; + struct worker_job_type_gs { STRING *name; STRING *units; @@ -94,10 +105,14 @@ struct worker_utilization { RRDSET *st_workers_busy_per_job_type; RRDDIM *rd_total_cpu_utilizaton; + + RRDSET *st_spinlocks_locks; + RRDSET *st_spinlocks_spins; + SPINLOCKS_JudyLSet spinlocks; }; static struct worker_utilization all_workers_utilization[] = { - { .name = "STATS", .family = "workers telemetry", .priority = 1000000 }, + { .name = "PULSE", .family = "workers pulse", .priority = 1000000 }, { .name = "HEALTH", .family = "workers health alarms", .priority = 1000000 }, { .name = "MLTRAIN", .family = "workers ML training", .priority = 1000000 }, { .name = 
"MLDETECT", .family = "workers ML detection", .priority = 1000000 }, @@ -132,6 +147,108 @@ static struct worker_utilization all_workers_utilization[] = { { .name = NULL, .family = NULL } }; +static void workers_total_spinlock_contention_chart(void) { + { + static RRDSET *st = NULL; + + if(unlikely(!st)) { + st = rrdset_create_localhost( + "netdata" + , "spinlock_total_locks" + , NULL + , "spinlocks" + , "netdata.spinlock_total_locks" + , "Netdata Total Spinlock Locks" + , "locks" + , "netdata" + , "pulse" + , 920000 + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + } + + Word_t idx = 0; + for(struct worker_spinlocks *wusp = SPINLOCKS_FIRST(&ALL_SPINLOCKS, &idx); + wusp; + wusp = SPINLOCKS_NEXT(&ALL_SPINLOCKS, &idx)) { + const char *func = (const char *)idx; + RRDDIM *rd = rrddim_find(st, func); + if(!rd) rd = rrddim_add(st, func, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rrddim_set_by_pointer(st, rd, (collected_number)wusp->locks); + } + + rrdset_done(st); + } + + { + static RRDSET *st = NULL; + if(unlikely(!st)) { + st = rrdset_create_localhost( + "netdata" + , "spinlock_total_spins" + , NULL + , "spinlocks" + , "netdata.spinlock_total_spins" + , "Netdata Total Spinlock Spins" + , "spins" + , "netdata" + , "pulse" + , 920001 + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + } + + Word_t idx = 0; + for(struct worker_spinlocks *wusp = SPINLOCKS_FIRST(&ALL_SPINLOCKS, &idx); + wusp; + wusp = SPINLOCKS_NEXT(&ALL_SPINLOCKS, &idx)) { + const char *func = (const char *)idx; + RRDDIM *rd = rrddim_find(st, func); + if(!rd) rd = rrddim_add(st, func, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rrddim_set_by_pointer(st, rd, (collected_number)wusp->spins); + } + + rrdset_done(st); + } + + { + static RRDSET *st = NULL; + if(unlikely(!st)) { + st = rrdset_create_localhost( + "netdata" + , "spinlock_total_spins_per_lock" + , NULL + , "spinlocks" + , "netdata.spinlock_total_spins_per_lock" + , "Netdata Average Spinlock Spins Per Lock" + , "spins" + , "netdata" + , 
"pulse" + , 920002 + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + } + + Word_t idx = 0; + for(struct worker_spinlocks *wusp = SPINLOCKS_FIRST(&ALL_SPINLOCKS, &idx); + wusp; + wusp = SPINLOCKS_NEXT(&ALL_SPINLOCKS, &idx)) { + const char *func = (const char *)idx; + RRDDIM *rd = rrddim_find(st, func); + if(!rd) rd = rrddim_add(st, func, NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE); + if(!wusp->locks) + rrddim_set_by_pointer(st, rd, 0); + else + rrddim_set_by_pointer(st, rd, (collected_number)((uint64_t)wusp->spins * 10000ULL / (uint64_t)wusp->locks)); + } + + rrdset_done(st); + } +} + static void workers_total_cpu_utilization_chart(void) { size_t i, cpu_enabled = 0; for(i = 0; all_workers_utilization[i].name ;i++) @@ -151,7 +268,7 @@ static void workers_total_cpu_utilization_chart(void) { "Netdata Workers CPU Utilization (100% = 1 core)", "%", "netdata", - "stats", + "pulse", 999000, localhost->rrd_update_every, RRDSET_TYPE_STACKED); @@ -199,7 +316,7 @@ static void workers_utilization_update_chart(struct worker_utilization *wu) { , "Netdata Workers Busy Time (100% = all workers busy)" , "%" , "netdata" - , "stats" + , "pulse" , wu->priority , localhost->rrd_update_every , RRDSET_TYPE_AREA @@ -252,7 +369,7 @@ static void workers_utilization_update_chart(struct worker_utilization *wu) { , "Netdata Workers CPU Utilization (100% = all workers busy)" , "%" , "netdata" - , "stats" + , "pulse" , wu->priority + 1 , localhost->rrd_update_every , RRDSET_TYPE_AREA @@ -303,7 +420,7 @@ static void workers_utilization_update_chart(struct worker_utilization *wu) { , "Netdata Workers Jobs Started by Type" , "jobs" , "netdata" - , "stats" + , "pulse" , wu->priority + 2 , localhost->rrd_update_every , RRDSET_TYPE_STACKED @@ -346,7 +463,7 @@ static void workers_utilization_update_chart(struct worker_utilization *wu) { , "Netdata Workers Busy Time by Type" , "ms" , "netdata" - , "stats" + , "pulse" , wu->priority + 3 , localhost->rrd_update_every , RRDSET_TYPE_STACKED @@ -390,7 
+507,7 @@ static void workers_utilization_update_chart(struct worker_utilization *wu) { , "Netdata Workers Threads" , "threads" , "netdata" - , "stats" + , "pulse" , wu->priority + 4 , localhost->rrd_update_every , RRDSET_TYPE_STACKED @@ -405,6 +522,85 @@ static void workers_utilization_update_chart(struct worker_utilization *wu) { rrdset_done(wu->st_workers_threads); } + // ---------------------------------------------------------------------- + // spinlocks + + { + if(unlikely(!wu->st_spinlocks_locks)) { + char name[RRD_ID_LENGTH_MAX + 1]; + snprintfz(name, RRD_ID_LENGTH_MAX, "workers_spinlock_locks_%s", wu->name_lowercase); + + char context[RRD_ID_LENGTH_MAX + 1]; + snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.spinlock_locks", wu->name_lowercase); + + wu->st_spinlocks_locks = rrdset_create_localhost( + "netdata" + , name + , NULL + , wu->family + , context + , "Netdata Spinlock Locks" + , "locks" + , "netdata" + , "pulse" + , wu->priority + 5 + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + } + + Word_t idx = 0; + for(struct worker_spinlocks *wusp = SPINLOCKS_FIRST(&wu->spinlocks, &idx); + wusp; + wusp = SPINLOCKS_NEXT(&wu->spinlocks, &idx)) { + const char *func = (const char *)idx; + if(!wusp->rd_locks) + wusp->rd_locks = rrddim_add(wu->st_spinlocks_locks, func, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(wu->st_spinlocks_locks, wusp->rd_locks, (collected_number)wusp->locks); + } + + rrdset_done(wu->st_spinlocks_locks); + } + + { + if(unlikely(!wu->st_spinlocks_spins)) { + char name[RRD_ID_LENGTH_MAX + 1]; + snprintfz(name, RRD_ID_LENGTH_MAX, "workers_spinlock_spins_%s", wu->name_lowercase); + + char context[RRD_ID_LENGTH_MAX + 1]; + snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.spinlock_spins", wu->name_lowercase); + + wu->st_spinlocks_spins = rrdset_create_localhost( + "netdata" + , name + , NULL + , wu->family + , context + , "Netdata Spinlock Spins" + , "spins" + , "netdata" + , "pulse" + , wu->priority + 
6 + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + } + + Word_t idx = 0; + for(struct worker_spinlocks *wusp = SPINLOCKS_FIRST(&wu->spinlocks, &idx); + wusp; + wusp = SPINLOCKS_NEXT(&wu->spinlocks, &idx)) { + const char *func = (const char *)idx; + if(!wusp->rd_spins) + wusp->rd_spins = rrddim_add(wu->st_spinlocks_spins, func, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(wu->st_spinlocks_spins, wusp->rd_spins, (collected_number)wusp->spins); + } + + rrdset_done(wu->st_spinlocks_spins); + } + // ---------------------------------------------------------------------- // custom metric types WORKER_METRIC_ABSOLUTE @@ -442,8 +638,8 @@ static void workers_utilization_update_chart(struct worker_utilization *wu) { , title , (wu->per_job_type[i].units)?string2str(wu->per_job_type[i].units):"value" , "netdata" - , "stats" - , wu->priority + 5 + i + , "pulse" + , wu->priority + 10 + i , localhost->rrd_update_every , RRDSET_TYPE_LINE ); @@ -498,8 +694,8 @@ static void workers_utilization_update_chart(struct worker_utilization *wu) { , title , (wu->per_job_type[i].units)?string2str(wu->per_job_type[i].units):"rate" , "netdata" - , "stats" - , wu->priority + 5 + i + , "pulse" + , wu->priority + 10 + i , localhost->rrd_update_every , RRDSET_TYPE_LINE ); @@ -519,6 +715,14 @@ static void workers_utilization_update_chart(struct worker_utilization *wu) { } static void workers_utilization_reset_statistics(struct worker_utilization *wu) { + Word_t idx = 0; + for(struct worker_spinlocks *wusp = SPINLOCKS_FIRST(&wu->spinlocks, &idx); + wusp; + wusp = SPINLOCKS_NEXT(&wu->spinlocks, &idx)) { + wusp->locks = 0; + wusp->spins = 0; + } + wu->workers_registered = 0; wu->workers_busy = 0; wu->workers_total_busy_time = 0; @@ -646,19 +850,22 @@ static struct worker_thread *worker_thread_find_or_create(struct worker_utilizat } static void worker_utilization_charts_callback(void *ptr - , pid_t pid __maybe_unused + , pid_t pid , const char *thread_tag __maybe_unused - , 
size_t max_job_id __maybe_unused - , size_t utilization_usec __maybe_unused - , size_t duration_usec __maybe_unused - , size_t jobs_started __maybe_unused - , size_t is_running __maybe_unused - , STRING **job_types_names __maybe_unused - , STRING **job_types_units __maybe_unused - , WORKER_METRIC_TYPE *job_types_metric_types __maybe_unused - , size_t *job_types_jobs_started __maybe_unused - , usec_t *job_types_busy_time __maybe_unused - , NETDATA_DOUBLE *job_types_custom_metrics __maybe_unused + , size_t max_job_id + , size_t utilization_usec + , size_t duration_usec + , size_t jobs_started + , size_t is_running + , STRING **job_types_names + , STRING **job_types_units + , WORKER_METRIC_TYPE *job_types_metric_types + , size_t *job_types_jobs_started + , usec_t *job_types_busy_time + , NETDATA_DOUBLE *job_types_custom_metrics + , const char *spinlock_functions[] + , size_t *spinlock_locks + , size_t *spinlock_spins ) { struct worker_utilization *wu = (struct worker_utilization *)ptr; @@ -693,8 +900,7 @@ static void worker_utilization_charts_callback(void *ptr wu->workers_min_busy_time = util; // accumulate per job type statistics - size_t i; - for(i = 0; i <= max_job_id ;i++) { + for(size_t i = 0; i <= max_job_id ;i++) { if(!wu->per_job_type[i].name && job_types_names[i]) wu->per_job_type[i].name = string_dup(job_types_names[i]); @@ -739,9 +945,31 @@ static void worker_utilization_charts_callback(void *ptr if(cpu > wu->workers_cpu_max) wu->workers_cpu_max = cpu; } wu->workers_cpu_registered += (wt->cpu_enabled) ? 
1 : 0; + + // ---------------------------------------------------------------------------------------------------------------- + // spinlock contention + + // spinlocks + for(size_t i = 0; i < WORKER_SPINLOCK_CONTENTION_FUNCTIONS && spinlock_functions[i] ;i++) { + struct worker_spinlocks *wusp = SPINLOCKS_GET(&wu->spinlocks, (Word_t)spinlock_functions[i]); + if(!wusp) { + wusp = callocz(1, sizeof(*wusp)); + SPINLOCKS_SET(&wu->spinlocks, (Word_t)spinlock_functions[i], wusp); + } + wusp->locks += spinlock_locks[i]; + wusp->spins += spinlock_spins[i]; + + wusp = SPINLOCKS_GET(&ALL_SPINLOCKS, (Word_t)spinlock_functions[i]); + if(!wusp) { + wusp = callocz(1, sizeof(*wusp)); + SPINLOCKS_SET(&ALL_SPINLOCKS, (Word_t)spinlock_functions[i], wusp); + } + wusp->locks += spinlock_locks[i]; + wusp->spins += spinlock_spins[i]; + } } -void telemetry_workers_cleanup(void) { +void pulse_workers_cleanup(void) { int i, j; for(i = 0; all_workers_utilization[i].name ;i++) { struct worker_utilization *wu = &all_workers_utilization[i]; @@ -769,12 +997,20 @@ void telemetry_workers_cleanup(void) { } } -void telemetry_workers_do(bool extended) { +void pulse_workers_do(bool extended) { if(!extended) return; static size_t iterations = 0; iterations++; + Word_t idx = 0; + for(struct worker_spinlocks *wusp = SPINLOCKS_FIRST(&ALL_SPINLOCKS, &idx); + wusp; + wusp = SPINLOCKS_NEXT(&ALL_SPINLOCKS, &idx)) { + wusp->locks = 0; + wusp->spins = 0; + } + for(int i = 0; all_workers_utilization[i].name ;i++) { workers_utilization_reset_statistics(&all_workers_utilization[i]); @@ -788,4 +1024,5 @@ void telemetry_workers_do(bool extended) { } workers_total_cpu_utilization_chart(); + workers_total_spinlock_contention_chart(); } diff --git a/src/daemon/pulse/pulse-workers.h b/src/daemon/pulse/pulse-workers.h new file mode 100644 index 00000000000000..f2bb28a28853d5 --- /dev/null +++ b/src/daemon/pulse/pulse-workers.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef 
NETDATA_PULSE_WORKERS_H +#define NETDATA_PULSE_WORKERS_H + +#include "daemon/common.h" + +#if defined(PULSE_INTERNALS) +void pulse_workers_do(bool extended); +void pulse_workers_cleanup(void); +#endif + +#endif //NETDATA_PULSE_WORKERS_H diff --git a/src/daemon/telemetry/telemetry.c b/src/daemon/pulse/pulse.c similarity index 56% rename from src/daemon/telemetry/telemetry.c rename to src/daemon/pulse/pulse.c index 65bcbce56ca8ff..2c30e8931ab7eb 100644 --- a/src/daemon/telemetry/telemetry.c +++ b/src/daemon/pulse/pulse.c @@ -1,18 +1,18 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#define TELEMETRY_INTERNALS 1 +#define PULSE_INTERNALS 1 #include "daemon/common.h" -#define WORKER_JOB_TELEMETRY_DAEMON 0 +#define WORKER_JOB_DAEMON 0 #define WORKER_JOB_SQLITE3 1 -#define WORKER_JOB_TELEMETRY_HTTP_API 2 -#define WORKER_JOB_TELEMETRY_QUERIES 3 -#define WORKER_JOB_TELEMETRY_INGESTION 4 +#define WORKER_JOB_HTTP_API 2 +#define WORKER_JOB_QUERIES 3 +#define WORKER_JOB_INGESTION 4 #define WORKER_JOB_DBENGINE 5 #define WORKER_JOB_STRINGS 6 #define WORKER_JOB_DICTIONARIES 7 -#define WORKER_JOB_TELEMETRY_ML 8 -#define WORKER_JOB_TELEMETRY_GORILLA 9 +#define WORKER_JOB_ML 8 +#define WORKER_JOB_GORILLA 9 #define WORKER_JOB_HEARTBEAT 10 #define WORKER_JOB_WORKERS 11 #define WORKER_JOB_MALLOC_TRACE 12 @@ -23,22 +23,22 @@ #error "WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 14" #endif -bool telemetry_enabled = true; -bool telemetry_extended_enabled = false; +bool pulse_enabled = true; +bool pulse_extended_enabled = false; -static void telemetry_register_workers(void) { - worker_register("STATS"); +static void pulse_register_workers(void) { + worker_register("PULSE"); - worker_register_job_name(WORKER_JOB_TELEMETRY_DAEMON, "daemon"); + worker_register_job_name(WORKER_JOB_DAEMON, "daemon"); worker_register_job_name(WORKER_JOB_SQLITE3, "sqlite3"); - worker_register_job_name(WORKER_JOB_TELEMETRY_HTTP_API, "http-api"); - worker_register_job_name(WORKER_JOB_TELEMETRY_QUERIES, 
"queries"); - worker_register_job_name(WORKER_JOB_TELEMETRY_INGESTION, "ingestion"); + worker_register_job_name(WORKER_JOB_HTTP_API, "http-api"); + worker_register_job_name(WORKER_JOB_QUERIES, "queries"); + worker_register_job_name(WORKER_JOB_INGESTION, "ingestion"); worker_register_job_name(WORKER_JOB_DBENGINE, "dbengine"); worker_register_job_name(WORKER_JOB_STRINGS, "strings"); worker_register_job_name(WORKER_JOB_DICTIONARIES, "dictionaries"); - worker_register_job_name(WORKER_JOB_TELEMETRY_ML, "ML"); - worker_register_job_name(WORKER_JOB_TELEMETRY_GORILLA, "gorilla"); + worker_register_job_name(WORKER_JOB_ML, "ML"); + worker_register_job_name(WORKER_JOB_GORILLA, "gorilla"); worker_register_job_name(WORKER_JOB_HEARTBEAT, "heartbeat"); worker_register_job_name(WORKER_JOB_WORKERS, "workers"); worker_register_job_name(WORKER_JOB_MALLOC_TRACE, "malloc_trace"); @@ -46,32 +46,32 @@ static void telemetry_register_workers(void) { worker_register_job_name(WORKER_JOB_ARAL, "aral"); } -static void telementry_cleanup(void *pptr) +static void pulse_cleanup(void *pptr) { struct netdata_static_thread *static_thread = CLEANUP_FUNCTION_GET_PTR(pptr); if(!static_thread) return; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; - telemetry_workers_cleanup(); + pulse_workers_cleanup(); worker_unregister(); netdata_log_info("cleaning up..."); static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } -void *telemetry_thread_main(void *ptr) { - CLEANUP_FUNCTION_REGISTER(telementry_cleanup) cleanup_ptr = ptr; - telemetry_register_workers(); +void *pulse_thread_main(void *ptr) { + CLEANUP_FUNCTION_REGISTER(pulse_cleanup) cleanup_ptr = ptr; + pulse_register_workers(); int update_every = - (int)config_get_duration_seconds(CONFIG_SECTION_TELEMETRY, "update every", localhost->rrd_update_every); + (int)config_get_duration_seconds(CONFIG_SECTION_PULSE, "update every", localhost->rrd_update_every); if (update_every < localhost->rrd_update_every) { update_every = localhost->rrd_update_every; - 
config_set_duration_seconds(CONFIG_SECTION_TELEMETRY, "update every", update_every); + config_set_duration_seconds(CONFIG_SECTION_PULSE, "update every", update_every); } - telemerty_aral_init(); + pulse_aral_init(); usec_t step = update_every * USEC_PER_SEC; heartbeat_t hb; @@ -91,28 +91,28 @@ void *telemetry_thread_main(void *ptr) { } real_step = USEC_PER_SEC; - worker_is_busy(WORKER_JOB_TELEMETRY_INGESTION); - telemetry_ingestion_do(telemetry_extended_enabled); + worker_is_busy(WORKER_JOB_INGESTION); + pulse_ingestion_do(pulse_extended_enabled); - worker_is_busy(WORKER_JOB_TELEMETRY_HTTP_API); - telemetry_web_do(telemetry_extended_enabled); + worker_is_busy(WORKER_JOB_HTTP_API); + pulse_web_do(pulse_extended_enabled); - worker_is_busy(WORKER_JOB_TELEMETRY_QUERIES); - telemetry_queries_do(telemetry_extended_enabled); + worker_is_busy(WORKER_JOB_QUERIES); + pulse_queries_do(pulse_extended_enabled); - worker_is_busy(WORKER_JOB_TELEMETRY_ML); - telemetry_ml_do(telemetry_extended_enabled); + worker_is_busy(WORKER_JOB_ML); + pulse_ml_do(pulse_extended_enabled); - worker_is_busy(WORKER_JOB_TELEMETRY_GORILLA); - telemetry_gorilla_do(telemetry_extended_enabled); + worker_is_busy(WORKER_JOB_GORILLA); + pulse_gorilla_do(pulse_extended_enabled); worker_is_busy(WORKER_JOB_HEARTBEAT); - telemetry_heartbeat_do(telemetry_extended_enabled); + pulse_heartbeat_do(pulse_extended_enabled); #ifdef ENABLE_DBENGINE if(dbengine_enabled) { worker_is_busy(WORKER_JOB_DBENGINE); - telemetry_dbengine_do(telemetry_extended_enabled); + pulse_dbengine_do(pulse_extended_enabled); } #endif @@ -120,37 +120,37 @@ void *telemetry_thread_main(void *ptr) { registry_statistics(); worker_is_busy(WORKER_JOB_STRINGS); - telemetry_string_do(telemetry_extended_enabled); + pulse_string_do(pulse_extended_enabled); #ifdef DICT_WITH_STATS worker_is_busy(WORKER_JOB_DICTIONARIES); - telemetry_dictionary_do(telemetry_extended_enabled); + pulse_dictionary_do(pulse_extended_enabled); #endif #ifdef 
NETDATA_TRACE_ALLOCATIONS worker_is_busy(WORKER_JOB_MALLOC_TRACE); - telemetry_trace_allocations_do(telemetry_extended_enabled); + pulse_trace_allocations_do(pulse_extended_enabled); #endif worker_is_busy(WORKER_JOB_WORKERS); - telemetry_workers_do(telemetry_extended_enabled); + pulse_workers_do(pulse_extended_enabled); worker_is_busy(WORKER_JOB_ARAL); - telemetry_aral_do(telemetry_extended_enabled); + pulse_aral_do(pulse_extended_enabled); // keep this last to have access to the memory counters // exposed by everyone else - worker_is_busy(WORKER_JOB_TELEMETRY_DAEMON); - telemetry_daemon_do(telemetry_extended_enabled); + worker_is_busy(WORKER_JOB_DAEMON); + pulse_daemon_do(pulse_extended_enabled); } return NULL; } // --------------------------------------------------------------------------------------------------------------------- -// telemetry extended thread +// pulse sqlite3 thread -static void telemetry_thread_sqlite3_cleanup(void *pptr) +static void pulse_thread_sqlite3_cleanup(void *pptr) { struct netdata_static_thread *static_thread = CLEANUP_FUNCTION_GET_PTR(pptr); if (!static_thread) @@ -165,15 +165,15 @@ static void telemetry_thread_sqlite3_cleanup(void *pptr) static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } -void *telemetry_thread_sqlite3_main(void *ptr) { - CLEANUP_FUNCTION_REGISTER(telemetry_thread_sqlite3_cleanup) cleanup_ptr = ptr; - telemetry_register_workers(); +void *pulse_thread_sqlite3_main(void *ptr) { + CLEANUP_FUNCTION_REGISTER(pulse_thread_sqlite3_cleanup) cleanup_ptr = ptr; + pulse_register_workers(); int update_every = - (int)config_get_duration_seconds(CONFIG_SECTION_TELEMETRY, "update every", localhost->rrd_update_every); + (int)config_get_duration_seconds(CONFIG_SECTION_PULSE, "update every", localhost->rrd_update_every); if (update_every < localhost->rrd_update_every) { update_every = localhost->rrd_update_every; - config_set_duration_seconds(CONFIG_SECTION_TELEMETRY, "update every", update_every); + 
config_set_duration_seconds(CONFIG_SECTION_PULSE, "update every", update_every); } usec_t step = update_every * USEC_PER_SEC; @@ -191,7 +191,7 @@ void *telemetry_thread_sqlite3_main(void *ptr) { real_step = USEC_PER_SEC; worker_is_busy(WORKER_JOB_SQLITE3); - telemetry_sqlite3_do(telemetry_extended_enabled); + pulse_sqlite3_do(pulse_extended_enabled); } return NULL; diff --git a/src/daemon/pulse/pulse.h b/src/daemon/pulse/pulse.h new file mode 100644 index 00000000000000..6aca15326a768e --- /dev/null +++ b/src/daemon/pulse/pulse.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PULSE_H +#define NETDATA_PULSE_H 1 + +#include "database/rrd.h" + +extern bool pulse_enabled; +extern bool pulse_extended_enabled; + +#include "pulse-http-api.h" +#include "pulse-queries.h" +#include "pulse-ingestion.h" +#include "pulse-ml.h" +#include "pulse-gorilla.h" +#include "pulse-daemon.h" +#include "pulse-daemon-memory.h" +#include "pulse-sqlite3.h" +#include "pulse-dbengine.h" +#include "pulse-string.h" +#include "pulse-heartbeat.h" +#include "pulse-dictionary.h" +#include "pulse-workers.h" +#include "pulse-trace-allocations.h" +#include "pulse-aral.h" + +void *pulse_thread_main(void *ptr); +void *pulse_thread_sqlite3_main(void *ptr); + +#endif /* NETDATA_PULSE_H */ diff --git a/src/daemon/static_threads.c b/src/daemon/static_threads.c index 91a20a85a33fa1..fa4de32e183e49 100644 --- a/src/daemon/static_threads.c +++ b/src/daemon/static_threads.c @@ -12,7 +12,7 @@ void *statsd_main(void *ptr); void *profile_main(void *ptr); void *replication_thread_main(void *ptr); -extern bool telemetry_enabled; +extern bool pulse_enabled; const struct netdata_static_thread static_threads_common[] = { { @@ -43,26 +43,26 @@ const struct netdata_static_thread static_threads_common[] = { .start_routine = analytics_main }, { - .name = "TELEMETRY", + .name = "PULSE", .config_section = CONFIG_SECTION_PLUGINS, - .config_name = "netdata telemetry", + .config_name = "netdata 
pulse", .env_name = "NETDATA_INTERNALS_MONITORING", - .global_variable = &telemetry_enabled, + .global_variable = &pulse_enabled, .enabled = 1, .thread = NULL, .init_routine = NULL, - .start_routine = telemetry_thread_main + .start_routine = pulse_thread_main }, { - .name = "TLMTRY-SQLITE3", - .config_section = CONFIG_SECTION_TELEMETRY, - .config_name = "extended telemetry", + .name = "PULSE-SQLITE3", + .config_section = CONFIG_SECTION_PULSE, + .config_name = "extended", .env_name = NULL, - .global_variable = &telemetry_extended_enabled, + .global_variable = &pulse_extended_enabled, .enabled = 0, // the default value - it uses netdata.conf for users to enable it .thread = NULL, .init_routine = NULL, - .start_routine = telemetry_thread_sqlite3_main + .start_routine = pulse_thread_sqlite3_main }, { .name = "PLUGINSD", diff --git a/src/daemon/telemetry/telemetry-aral.h b/src/daemon/telemetry/telemetry-aral.h deleted file mode 100644 index 26c08ffda4de0b..00000000000000 --- a/src/daemon/telemetry/telemetry-aral.h +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_ARAL_H -#define NETDATA_TELEMETRY_ARAL_H - -#include "daemon/common.h" - -void telemetry_aral_register(ARAL *ar, const char *name); -void telemetry_aral_unregister(ARAL *ar); - -#if defined(TELEMETRY_INTERNALS) -void telemerty_aral_init(void); -void telemetry_aral_do(bool extended); -#endif - -#endif //NETDATA_TELEMETRY_ARAL_H diff --git a/src/daemon/telemetry/telemetry-daemon.h b/src/daemon/telemetry/telemetry-daemon.h deleted file mode 100644 index 44023312e39acf..00000000000000 --- a/src/daemon/telemetry/telemetry-daemon.h +++ /dev/null @@ -1,12 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_DAEMON_H -#define NETDATA_TELEMETRY_DAEMON_H - -#include "daemon/common.h" - -#if defined(TELEMETRY_INTERNALS) -void telemetry_daemon_do(bool extended); -#endif - -#endif //NETDATA_TELEMETRY_DAEMON_H diff --git 
a/src/daemon/telemetry/telemetry-dbengine.h b/src/daemon/telemetry/telemetry-dbengine.h deleted file mode 100644 index af120501e1e980..00000000000000 --- a/src/daemon/telemetry/telemetry-dbengine.h +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_DBENGINE_H -#define NETDATA_TELEMETRY_DBENGINE_H - -#include "daemon/common.h" - -#if defined(TELEMETRY_INTERNALS) -extern size_t telemetry_dbengine_total_memory; - -#if defined(ENABLE_DBENGINE) -void telemetry_dbengine_do(bool extended); -#endif - -#endif - -#endif //NETDATA_TELEMETRY_DBENGINE_H diff --git a/src/daemon/telemetry/telemetry-gorilla.h b/src/daemon/telemetry/telemetry-gorilla.h deleted file mode 100644 index 845e2c42868d7c..00000000000000 --- a/src/daemon/telemetry/telemetry-gorilla.h +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_GORILLA_H -#define NETDATA_TELEMETRY_GORILLA_H - -#include "daemon/common.h" - -void telemetry_gorilla_hot_buffer_added(); -void telemetry_gorilla_tier0_page_flush(uint32_t actual, uint32_t optimal, uint32_t original); - -#if defined(TELEMETRY_INTERNALS) -void telemetry_gorilla_do(bool extended); -#endif - -#endif //NETDATA_TELEMETRY_GORILLA_H diff --git a/src/daemon/telemetry/telemetry-heartbeat.h b/src/daemon/telemetry/telemetry-heartbeat.h deleted file mode 100644 index c8a021a7ee7243..00000000000000 --- a/src/daemon/telemetry/telemetry-heartbeat.h +++ /dev/null @@ -1,12 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_HEARTBEAT_H -#define NETDATA_TELEMETRY_HEARTBEAT_H - -#include "daemon/common.h" - -#if defined(TELEMETRY_INTERNALS) -void telemetry_heartbeat_do(bool extended); -#endif - -#endif //NETDATA_TELEMETRY_HEARTBEAT_H diff --git a/src/daemon/telemetry/telemetry-ingestion.h b/src/daemon/telemetry/telemetry-ingestion.h deleted file mode 100644 index ab72ea0e8a0c4e..00000000000000 --- 
a/src/daemon/telemetry/telemetry-ingestion.h +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_INGESTION_H -#define NETDATA_TELEMETRY_INGESTION_H - -#include "daemon/common.h" - -void telemetry_queries_rrdset_collection_completed(size_t *points_read_per_tier_array); - -#if defined(TELEMETRY_INTERNALS) -void telemetry_ingestion_do(bool extended); -#endif - -#endif //NETDATA_TELEMETRY_INGESTION_H diff --git a/src/daemon/telemetry/telemetry-ml.h b/src/daemon/telemetry/telemetry-ml.h deleted file mode 100644 index ff992cbe2ffdf3..00000000000000 --- a/src/daemon/telemetry/telemetry-ml.h +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_ML_H -#define NETDATA_TELEMETRY_ML_H - -#include "daemon/common.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void telemetry_ml_models_consulted(size_t models_consulted); -void telemetry_ml_models_received(); -void telemetry_ml_models_ignored(); -void telemetry_ml_models_sent(); - -void telemetry_ml_memory_allocated(size_t n); -void telemetry_ml_memory_freed(size_t n); - -void global_statistics_ml_models_deserialization_failures(); - -uint64_t telemetry_ml_get_current_memory_usage(void); - -#if defined(TELEMETRY_INTERNALS) -void telemetry_ml_do(bool extended); -#endif - -#ifdef __cplusplus -} -#endif - - -#endif //NETDATA_TELEMETRY_ML_H diff --git a/src/daemon/telemetry/telemetry-queries.h b/src/daemon/telemetry/telemetry-queries.h deleted file mode 100644 index 67c0a3679ca480..00000000000000 --- a/src/daemon/telemetry/telemetry-queries.h +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_QUERIES_H -#define NETDATA_TELEMETRY_QUERIES_H - -#include "daemon/common.h" - -void telemetry_queries_ml_query_completed(size_t points_read); -void telemetry_queries_exporters_query_completed(size_t points_read); -void telemetry_queries_backfill_query_completed(size_t points_read); 
-void telemetry_queries_rrdr_query_completed(size_t queries, uint64_t db_points_read, uint64_t result_points_generated, QUERY_SOURCE query_source); - -#if defined(TELEMETRY_INTERNALS) -void telemetry_queries_do(bool extended); -#endif - -#endif //NETDATA_TELEMETRY_QUERIES_H diff --git a/src/daemon/telemetry/telemetry-sqlite3.h b/src/daemon/telemetry/telemetry-sqlite3.h deleted file mode 100644 index 1c124dfa15cf97..00000000000000 --- a/src/daemon/telemetry/telemetry-sqlite3.h +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_SQLITE3_H -#define NETDATA_TELEMETRY_SQLITE3_H - -#include "daemon/common.h" - -void telemetry_sqlite3_query_completed(bool success, bool busy, bool locked); -void telemetry_sqlite3_row_completed(void); - -#if defined(TELEMETRY_INTERNALS) -void telemetry_sqlite3_do(bool extended); -#endif - -#endif //NETDATA_TELEMETRY_SQLITE3_H diff --git a/src/daemon/telemetry/telemetry-string.h b/src/daemon/telemetry/telemetry-string.h deleted file mode 100644 index 21fd08127deba7..00000000000000 --- a/src/daemon/telemetry/telemetry-string.h +++ /dev/null @@ -1,12 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_STRING_H -#define NETDATA_TELEMETRY_STRING_H - -#include "daemon/common.h" - -#if defined(TELEMETRY_INTERNALS) -void telemetry_string_do(bool extended); -#endif - -#endif //NETDATA_TELEMETRY_STRING_H diff --git a/src/daemon/telemetry/telemetry-trace-allocations.h b/src/daemon/telemetry/telemetry-trace-allocations.h deleted file mode 100644 index c44bf2e3ebb31b..00000000000000 --- a/src/daemon/telemetry/telemetry-trace-allocations.h +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_TRACE_ALLOCATIONS_H -#define NETDATA_TELEMETRY_TRACE_ALLOCATIONS_H - -#include "daemon/common.h" - -#if defined(TELEMETRY_INTERNALS) -#ifdef NETDATA_TRACE_ALLOCATIONS -void telemetry_trace_allocations_do(bool extended); -#endif 
-#endif - -#endif //NETDATA_TELEMETRY_TRACE_ALLOCATIONS_H diff --git a/src/daemon/telemetry/telemetry-workers.h b/src/daemon/telemetry/telemetry-workers.h deleted file mode 100644 index 02acd59a982e44..00000000000000 --- a/src/daemon/telemetry/telemetry-workers.h +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_WORKERS_H -#define NETDATA_TELEMETRY_WORKERS_H - -#include "daemon/common.h" - -#if defined(TELEMETRY_INTERNALS) -void telemetry_workers_do(bool extended); -void telemetry_workers_cleanup(void); -#endif - -#endif //NETDATA_TELEMETRY_WORKERS_H diff --git a/src/daemon/telemetry/telemetry.h b/src/daemon/telemetry/telemetry.h deleted file mode 100644 index 54f5357ac6c57c..00000000000000 --- a/src/daemon/telemetry/telemetry.h +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_TELEMETRY_H -#define NETDATA_TELEMETRY_H 1 - -#include "database/rrd.h" - -extern bool telemetry_enabled; -extern bool telemetry_extended_enabled; - -#include "telemetry-http-api.h" -#include "telemetry-queries.h" -#include "telemetry-ingestion.h" -#include "telemetry-ml.h" -#include "telemetry-gorilla.h" -#include "telemetry-daemon.h" -#include "telemetry-daemon-memory.h" -#include "telemetry-sqlite3.h" -#include "telemetry-dbengine.h" -#include "telemetry-string.h" -#include "telemetry-heartbeat.h" -#include "telemetry-dictionary.h" -#include "telemetry-workers.h" -#include "telemetry-trace-allocations.h" -#include "telemetry-aral.h" - -void *telemetry_thread_main(void *ptr); -void *telemetry_thread_sqlite3_main(void *ptr); - -#endif /* NETDATA_TELEMETRY_H */ diff --git a/src/database/contexts/query_target.c b/src/database/contexts/query_target.c index b25b8e427263ef..316155471eda10 100644 --- a/src/database/contexts/query_target.c +++ b/src/database/contexts/query_target.c @@ -27,12 +27,12 @@ static struct { } used; } query_target_base = { .available = { - .spinlock = 
NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .base = NULL, .count = 0, }, .used = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .base = NULL, .count = 0, }, diff --git a/src/database/engine/cache.c b/src/database/engine/cache.c index 1e1e137eaf895b..013e0c1f8d1bb4 100644 --- a/src/database/engine/cache.c +++ b/src/database/engine/cache.c @@ -121,11 +121,10 @@ struct pgc { struct pgc_index { alignas(64) RW_SPINLOCK rw_spinlock; Pvoid_t sections_judy; - } *index; - #ifdef PGC_WITH_ARAL - ARAL *aral; + ARAL *aral; #endif + } *index; struct { alignas(64) SPINLOCK spinlock; @@ -399,6 +398,7 @@ static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) { if(current_cache_size > wanted_cache_size && wanted_cache_size < current_cache_size - clean) wanted_cache_size = current_cache_size - clean; + bool signal_the_evictor = false; if(cache->config.out_of_memory_protection_bytes) { // out of memory protection OS_SYSTEM_MEMORY sm = os_system_memory(false); @@ -409,6 +409,7 @@ static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) { if (sm.ram_available_bytes < min_available) { // we must shrink wanted_cache_size = current_cache_size - (min_available - sm.ram_available_bytes); + signal_the_evictor = true; } else if(cache->config.use_all_ram) { // we can grow @@ -443,6 +444,12 @@ static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) { else if(per1000 >= cache->config.aggressive_evict_per1000) __atomic_add_fetch(&cache->stats.events_cache_needs_space_aggressively, 1, __ATOMIC_RELAXED); + if (signal_the_evictor && spinlock_trylock(&cache->evictor.spinlock)) { + completion_mark_complete_a_job(&cache->evictor.completion); + spinlock_unlock(&cache->evictor.spinlock); + __atomic_add_fetch(&cache->stats.waste_evict_thread_signals, 1, __ATOMIC_RELAXED); + } + return per1000; } @@ -464,12 +471,6 @@ static bool flush_pages(PGC *cache, size_t max_flushes, Word_t section, 
bool wai static void signal_evict_thread_or_evict_inline(PGC *cache, bool on_release) { const size_t per1000 = cache_usage_per1000(cache, NULL); - if (per1000 >= cache->config.healthy_size_per1000 && spinlock_trylock(&cache->evictor.spinlock)) { - __atomic_add_fetch(&cache->stats.waste_evict_thread_signals, 1, __ATOMIC_RELAXED); - completion_mark_complete_a_job(&cache->evictor.completion); - spinlock_unlock(&cache->evictor.spinlock); - } - if(!(cache->config.options & PGC_OPTIONS_EVICT_PAGES_NO_INLINE)) { if (per1000 > cache->config.aggressive_evict_per1000 && !on_release) { // the threads that add pages, turn into evictors when the cache needs evictions aggressively @@ -560,26 +561,15 @@ struct section_pages { static struct aral_statistics aral_statistics_for_pgc = { 0 }; static ARAL *pgc_sections_aral = NULL; -static ARAL *pgc_pages_aral = NULL; static void pgc_section_pages_static_aral_init(void) { - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; spinlock_lock(&spinlock); if(!pgc_sections_aral) pgc_sections_aral = aral_by_size_acquire(sizeof(struct section_pages)); - if(!pgc_pages_aral) { - pgc_pages_aral = aral_create( - "pgc_pages", - sizeof(PGC_PAGE), - 0, - 0, - &aral_statistics_for_pgc, - NULL, NULL, false, false); - } - spinlock_unlock(&spinlock); } @@ -646,14 +636,12 @@ static void pgc_queue_add(PGC *cache __maybe_unused, struct pgc_queue *q, PGC_PA // - New pages created as CLEAN, always have 1 access. // - DIRTY pages made CLEAN, depending on their accesses may be appended (accesses > 0) or prepended (accesses = 0). - // FIXME - is it better for fragmentation to always append? 
- -// if(page->accesses || page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED) { + if(page->accesses || page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED) { DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(q->base, page, link.prev, link.next); page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED); -// } -// else -// DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(q->base, page, link.prev, link.next); + } + else + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(q->base, page, link.prev, link.next); q->version++; } @@ -1037,7 +1025,7 @@ static inline void free_this_page(PGC *cache, PGC_PAGE *page, size_t partition _ // free our memory #ifdef PGC_WITH_ARAL - aral_freez(cache->aral, page); + aral_freez(cache->index[partition].aral, page); #else freez(page); #endif @@ -1431,7 +1419,7 @@ static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) { size_t partition = pgc_indexing_partition(cache, entry->metric_id); #ifdef PGC_WITH_ARAL - PGC_PAGE *allocation = aral_mallocz(cache->aral); + PGC_PAGE *allocation = aral_mallocz(cache->index[partition].aral); #endif PGC_PAGE *page; size_t spins = 0; @@ -1534,7 +1522,7 @@ static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) { if(unlikely(!page)) { // now that we don't have the lock, // give it some time for the old page to go away - yield_the_processor(); + tinysleep(); } } @@ -1542,7 +1530,7 @@ static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) { #ifdef PGC_WITH_ARAL if(allocation) - aral_freez(cache->aral, allocation); + aral_freez(cache->index[partition].aral, allocation); #endif __atomic_sub_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED); @@ -1950,7 +1938,9 @@ static void *pgc_evict_thread(void *ptr) { while (true) { worker_is_idle(); - unsigned new_job_id = completion_wait_for_a_job_with_timeout(&cache->evictor.completion, job_id, 50); + unsigned new_job_id = 
completion_wait_for_a_job_with_timeout( + &cache->evictor.completion, job_id, 100); + bool was_signaled = new_job_id > job_id; worker_is_busy(was_signaled ? 1 : 0); job_id = new_job_id; @@ -1958,31 +1948,16 @@ static void *pgc_evict_thread(void *ptr) { if (nd_thread_signaled_to_cancel()) return NULL; - spinlock_lock(&cache->evictor.spinlock); - - size_t at_once = 10; size_t size_to_evict = 0; size_t per1000 = cache_usage_per1000(cache, &size_to_evict); - bool was_aggressive = per1000 > cache->config.aggressive_evict_per1000; - - while (size_to_evict && ((--at_once && size_to_evict && per1000 > cache->config.healthy_size_per1000) || (per1000 > cache->config.aggressive_evict_per1000))) { - if (nd_thread_signaled_to_cancel()) { - spinlock_unlock(&cache->evictor.spinlock); - return NULL; - } + bool was_critical = per1000 >= cache->config.severe_pressure_per1000; + if(size_to_evict > 0) { evict_pages(cache, 0, 0, true, false); - if(was_signaled || was_aggressive) + if (was_signaled || was_critical) mallocz_release_as_much_memory_to_the_system(); - - yield_the_processor(); - - size_to_evict = 0; - per1000 = cache_usage_per1000(cache, &size_to_evict); } - - spinlock_unlock(&cache->evictor.spinlock); } worker_unregister(); @@ -2020,7 +1995,7 @@ PGC *pgc_create(const char *name, cache->config.options = options; cache->config.additional_bytes_per_page = additional_bytes_per_page; - cache->config.stats = telemetry_enabled; + cache->config.stats = pulse_enabled; // flushing cache->config.max_flushes_inline = (max_flushes_inline == 0) ? 2 : max_flushes_inline; @@ -2035,39 +2010,45 @@ PGC *pgc_create(const char *name, cache->config.max_pages_per_inline_eviction = max_pages_per_inline_eviction; cache->config.max_skip_pages_per_inline_eviction = (max_skip_pages_per_inline_eviction < 2) ? 
2 : max_skip_pages_per_inline_eviction; cache->config.severe_pressure_per1000 = 1010; // INLINE: use releasers to evict pages (up to max_pages_per_inline_eviction) - cache->config.aggressive_evict_per1000 = 990; // INLINE: use adders to evict page (up to max_pages_per_inline_eviction) - cache->config.healthy_size_per1000 = 980; // signal the eviction thread to evict immediately + cache->config.aggressive_evict_per1000 = 990; // INLINE: use adders to evict pages (up to max_pages_per_inline_eviction) + cache->config.healthy_size_per1000 = 980; // no evictions happen below this threshold cache->config.evict_low_threshold_per1000 = 970; // when evicting, bring the size down to this threshold + // the eviction thread is signaled ONLY if we run out of memory + // otherwise, it runs by itself every 100ms // use all ram and protection from out of memory cache->config.use_all_ram = dbengine_use_all_ram_for_caches; cache->config.out_of_memory_protection_bytes = dbengine_out_of_memory_protection; // partitions - cache->config.partitions = partitions == 0 ? 
1ULL + get_netdata_cpus() / 2 : partitions; + if(partitions == 0) partitions = get_netdata_cpus(); + if(partitions <= 4) partitions = 4; + if(partitions > 256) partitions = 256; + cache->config.partitions = partitions; cache->index = callocz(cache->config.partitions, sizeof(struct pgc_index)); pgc_section_pages_static_aral_init(); - for(size_t part = 0; part < cache->config.partitions ; part++) + for(size_t part = 0; part < cache->config.partitions ; part++) { rw_spinlock_init(&cache->index[part].rw_spinlock); - #ifdef PGC_WITH_ARAL - if(cache->config.additional_bytes_per_page) { - char buf[100]; - snprintfz(buf, sizeof(buf), "%s", name); - cache->aral = aral_create( - buf, - sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page, - 0, - 16364, - &aral_statistics_for_pgc, - NULL, NULL, false, false); + { + char buf[100]; + snprintfz(buf, sizeof(buf), "%s", name); + cache->index[part].aral = aral_create( + buf, + sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page, + 0, + 0, + &aral_statistics_for_pgc, + NULL, + NULL, + false, + false); + } } - else - cache->aral = pgc_pages_aral; - telemetry_aral_register(cache->aral, "pgc"); + pulse_aral_register(cache->index[0].aral, "pgc"); #endif @@ -2103,11 +2084,11 @@ PGC *pgc_create(const char *name, } size_t pgc_aral_structures(void) { - return aral_structures(pgc_pages_aral); + return aral_structures_from_stats(&aral_statistics_for_pgc); } size_t pgc_aral_overhead(void) { - return aral_overhead(pgc_pages_aral); + return aral_overhead_from_stats(&aral_statistics_for_pgc); } void pgc_flush_all_hot_and_dirty_pages(PGC *cache, Word_t section) { @@ -2138,13 +2119,13 @@ void pgc_destroy(PGC *cache) { else { pointer_destroy_index(cache); -// for(size_t part = 0; part < cache->config.partitions ; part++) -// netdata_rwlock_destroy(&cache->index[part].rw_spinlock); - + for(size_t part = 0; part < cache->config.partitions ;part++) { + // netdata_rwlock_destroy(&cache->index[part].rw_spinlock); #ifdef PGC_WITH_ARAL - 
if(cache->config.additional_bytes_per_page) - aral_destroy(cache->aral); + aral_destroy(cache->index[part].aral); #endif + } + freez(cache->index); freez(cache); } @@ -2300,12 +2281,10 @@ void pgc_set_nominal_page_size_callback(PGC *cache, nominal_page_size_callback c } size_t pgc_get_current_cache_size(PGC *cache) { - cache_usage_per1000(cache, NULL); return __atomic_load_n(&cache->stats.current_cache_size, __ATOMIC_RELAXED); } size_t pgc_get_wanted_cache_size(PGC *cache) { - cache_usage_per1000(cache, NULL); return __atomic_load_n(&cache->stats.wanted_cache_size, __ATOMIC_RELAXED); } @@ -2397,7 +2376,7 @@ PGC_PAGE *pgc_page_get_and_acquire(PGC *cache, Word_t section, Word_t metric_id, if(page || !retry) break; - yield_the_processor(); + tinysleep(); } if(page) { diff --git a/src/database/engine/datafile.c b/src/database/engine/datafile.c index 9e39268b000c4e..aab19534614cf0 100644 --- a/src/database/engine/datafile.c +++ b/src/database/engine/datafile.c @@ -56,10 +56,11 @@ bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS re return ret; } -void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) { +void datafile_release_with_trace(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason, const char *func) { spinlock_lock(&df->users.spinlock); if(!df->users.lockers) - fatal("DBENGINE DATAFILE: cannot release a datafile that is not acquired"); + fatal("DBENGINE DATAFILE: cannot release datafile %u of tier %u - it is not acquired, called from %s() with reason %u", + df->fileno, df->tier, func, reason); df->users.lockers--; df->users.lockers_by_reason[reason]--; diff --git a/src/database/engine/datafile.h b/src/database/engine/datafile.h index 843cb8c1e65abe..9e135142197cfb 100644 --- a/src/database/engine/datafile.h +++ b/src/database/engine/datafile.h @@ -76,7 +76,8 @@ struct rrdengine_datafile { }; bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason); -void 
datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason); +void datafile_release_with_trace(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason, const char *func); +#define datafile_release(df, reason) datafile_release_with_trace(df, reason, __FUNCTION__) bool datafile_acquire_for_deletion(struct rrdengine_datafile *df, bool is_shutdown); void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool having_lock); diff --git a/src/database/engine/metric.c b/src/database/engine/metric.c index 6e773bbb6fcbd5..6caeef76cfafd4 100644 --- a/src/database/engine/metric.c +++ b/src/database/engine/metric.c @@ -375,7 +375,7 @@ inline MRG *mrg_create(ssize_t partitions) { mrg->index[i].aral = aral_create(buf, sizeof(METRIC), 0, 16384, &mrg_aral_statistics, NULL, NULL, false, false); } - telemetry_aral_register(mrg->index[0].aral, "mrg"); + pulse_aral_register(mrg->index[0].aral, "mrg"); return mrg; } @@ -395,7 +395,7 @@ inline void mrg_destroy(MRG *mrg __maybe_unused) { // to delete entries, the caller needs to keep pointers to them // and delete them one by one - telemetry_aral_unregister(mrg->index[0].aral); + pulse_aral_unregister(mrg->index[0].aral); } inline METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret) { diff --git a/src/database/engine/page.c b/src/database/engine/page.c index 19c94077be873f..e642bf588d212a 100644 --- a/src/database/engine/page.c +++ b/src/database/engine/page.c @@ -54,18 +54,24 @@ struct pgd { // ---------------------------------------------------------------------------- // memory management -#define ARAL_TOLERANCE_TO_DEDUP 7 // deduplicate aral sizes, if the delta is below this number of bytes -#define PGD_ARAL_PARTITIONS 4 +// deduplicate aral sizes, if the delta is below this number of bytes +#define ARAL_TOLERANCE_TO_DEDUP 7 + +// max, we use as many as the cpu cores +// cannot be bigger than 256, due to struct pgd->partition (uint8_t) +#define 
PGD_ARAL_PARTITIONS_MAX 256 struct { + size_t partitions; + size_t sizeof_pgd; size_t sizeof_gorilla_writer_t; size_t sizeof_gorilla_buffer_32bit; - ARAL *aral_pgd[PGD_ARAL_PARTITIONS]; - ARAL *aral_gorilla_buffer[PGD_ARAL_PARTITIONS]; - ARAL *aral_gorilla_writer[PGD_ARAL_PARTITIONS]; -} pgd_alloc_globals = {}; + ARAL *aral_pgd[PGD_ARAL_PARTITIONS_MAX]; + ARAL *aral_gorilla_buffer[PGD_ARAL_PARTITIONS_MAX]; + ARAL *aral_gorilla_writer[PGD_ARAL_PARTITIONS_MAX]; +} pgd_alloc_globals = { 0 }; #if RRD_STORAGE_TIERS != 5 #error "You need to update the slots reserved for storage tiers" @@ -110,6 +116,11 @@ int aral_size_sort_compare(const void *a, const void *b) { } void pgd_init_arals(void) { + size_t partitions = get_netdata_cpus(); + if(partitions < 4) partitions = 4; + if(partitions > PGD_ARAL_PARTITIONS_MAX) partitions = PGD_ARAL_PARTITIONS_MAX; + pgd_alloc_globals.partitions = partitions; + aral_sizes_count = _countof(aral_sizes); for(size_t i = 0; i < RRD_STORAGE_TIERS ;i++) @@ -145,9 +156,9 @@ void pgd_init_arals(void) { aral_sizes[i] = 0; // allocate all the arals - arals = callocz(aral_sizes_count * PGD_ARAL_PARTITIONS, sizeof(ARAL *)); + arals = callocz(aral_sizes_count * pgd_alloc_globals.partitions, sizeof(ARAL *)); for(size_t slot = 0; slot < aral_sizes_count ; slot++) { - for(size_t partition = 0; partition < PGD_ARAL_PARTITIONS; partition++) { + for(size_t partition = 0; partition < pgd_alloc_globals.partitions; partition++) { if(partition > 0 && aral_sizes[slot] > 128) { // do not create partitions for sizes above 128 bytes @@ -169,7 +180,7 @@ void pgd_init_arals(void) { } } - for(size_t p = 0; p < PGD_ARAL_PARTITIONS ;p++) { + for(size_t p = 0; p < pgd_alloc_globals.partitions ;p++) { pgd_alloc_globals.aral_pgd[p] = pgd_get_aral_by_size_and_partition(sizeof(PGD), p); pgd_alloc_globals.aral_gorilla_writer[p] = pgd_get_aral_by_size_and_partition(sizeof(gorilla_writer_t), p); pgd_alloc_globals.aral_gorilla_buffer[p] = 
pgd_get_aral_by_size_and_partition(RRDENG_GORILLA_32BIT_BUFFER_SIZE, p); @@ -184,11 +195,11 @@ void pgd_init_arals(void) { pgd_alloc_globals.sizeof_gorilla_writer_t = aral_actual_element_size(pgd_alloc_globals.aral_gorilla_writer[0]); pgd_alloc_globals.sizeof_gorilla_buffer_32bit = aral_actual_element_size(pgd_alloc_globals.aral_gorilla_buffer[0]); - telemetry_aral_register(pgd_alloc_globals.aral_pgd[0], "pgd"); + pulse_aral_register(pgd_alloc_globals.aral_pgd[0], "pgd"); } static ARAL *pgd_get_aral_by_size_and_partition(size_t size, size_t partition) { - internal_fatal(partition >= PGD_ARAL_PARTITIONS, "Wrong partition %zu", partition); + internal_fatal(partition >= pgd_alloc_globals.partitions, "Wrong partition %zu", partition); size_t slot; @@ -218,17 +229,17 @@ static ARAL *pgd_get_aral_by_size_and_partition(size_t size, size_t partition) { } static inline gorilla_writer_t *pgd_gorilla_writer_alloc(size_t partition) { - internal_fatal(partition >= PGD_ARAL_PARTITIONS, "invalid gorilla writer partition %zu", partition); + internal_fatal(partition >= pgd_alloc_globals.partitions, "invalid gorilla writer partition %zu", partition); return aral_mallocz_marked(pgd_alloc_globals.aral_gorilla_writer[partition]); } static inline gorilla_buffer_t *pgd_gorilla_buffer_alloc(size_t partition) { - internal_fatal(partition >= PGD_ARAL_PARTITIONS, "invalid gorilla buffer partition %zu", partition); + internal_fatal(partition >= pgd_alloc_globals.partitions, "invalid gorilla buffer partition %zu", partition); return aral_mallocz_marked(pgd_alloc_globals.aral_gorilla_buffer[partition]); } static inline PGD *pgd_alloc(bool for_collector) { - size_t partition = gettid_cached() % PGD_ARAL_PARTITIONS; + size_t partition = gettid_cached() % pgd_alloc_globals.partitions; PGD *pgd; if(for_collector) @@ -312,7 +323,7 @@ PGD *pgd_create(uint8_t type, uint32_t slots) { // allocate new gorilla buffer gorilla_buffer_t *gbuf = pgd_gorilla_buffer_alloc(pg->partition); memset(gbuf, 0, 
RRDENG_GORILLA_32BIT_BUFFER_SIZE); - telemetry_gorilla_hot_buffer_added(); + pulse_gorilla_hot_buffer_added(); *pg->gorilla.writer = gorilla_writer_init(gbuf, RRDENG_GORILLA_32BIT_BUFFER_SLOTS); pg->gorilla.num_buffers = 1; @@ -381,7 +392,7 @@ void pgd_free(PGD *pg) { if (!pg || pg == PGD_EMPTY) return; - internal_fatal(pg->partition >= PGD_ARAL_PARTITIONS, + internal_fatal(pg->partition >= pgd_alloc_globals.partitions, "PGD partition is invalid %u", pg->partition); switch (pg->type) @@ -457,7 +468,7 @@ static void pgd_aral_unmark(PGD *pg) { !(pg->options & PAGE_OPTION_ARAL_MARKED)) return; - internal_fatal(pg->partition >= PGD_ARAL_PARTITIONS, + internal_fatal(pg->partition >= pgd_alloc_globals.partitions, "PGD partition is invalid %u", pg->partition); switch (pg->type) @@ -650,7 +661,7 @@ uint32_t pgd_disk_footprint(PGD *pg) size = pg->gorilla.num_buffers * RRDENG_GORILLA_32BIT_BUFFER_SIZE; if (pg->states & PGD_STATE_CREATED_FROM_COLLECTOR) - telemetry_gorilla_tier0_page_flush( + pulse_gorilla_tier0_page_flush( gorilla_writer_actual_nbytes(pg->gorilla.writer), gorilla_writer_optimal_nbytes(pg->gorilla.writer), tier_page_size[0]); @@ -778,7 +789,7 @@ size_t pgd_append_point(PGD *pg, gorilla_writer_add_buffer(pg->gorilla.writer, new_buffer, RRDENG_GORILLA_32BIT_BUFFER_SLOTS); pg->gorilla.num_buffers += 1; - telemetry_gorilla_hot_buffer_added(); + pulse_gorilla_hot_buffer_added(); ok = gorilla_writer_write(pg->gorilla.writer, t); internal_fatal(ok == false, "Failed to writer value in newly allocated gorilla buffer."); diff --git a/src/database/engine/pagecache.c b/src/database/engine/pagecache.c index 98de9a26ce2574..d440244dbee317 100644 --- a/src/database/engine/pagecache.c +++ b/src/database/engine/pagecache.c @@ -1033,49 +1033,32 @@ void pgc_open_add_hot_page(Word_t section, Word_t metric_id, time_t start_time_s size_t dynamic_open_cache_size(void) { size_t main_wanted_cache_size = pgc_get_wanted_cache_size(main_cache); - size_t target_size = 
main_wanted_cache_size / 100 * 5; // 5% - -// static bool query_current_size = true; -// if(query_current_size) { -// size_t main_current_cache_size = pgc_get_current_cache_size(main_cache); -// -// size_t main_free_cache_size = (main_wanted_cache_size > main_current_cache_size) ? -// main_wanted_cache_size - main_current_cache_size : 0; -// -// if(main_free_cache_size > target_size) -// target_size = main_free_cache_size; -// else -// query_current_size = false; -// } + size_t target_size = main_wanted_cache_size / 100 * 10; // 10% if(target_size < 2 * 1024 * 1024) target_size = 2 * 1024 * 1024; - return target_size; + size_t main_current_cache_size = pgc_get_current_cache_size(main_cache); + + size_t main_free_cache_size = (main_wanted_cache_size > main_current_cache_size) ? + main_wanted_cache_size - main_current_cache_size : 0; + + return target_size + main_free_cache_size; } size_t dynamic_extent_cache_size(void) { size_t main_wanted_cache_size = pgc_get_wanted_cache_size(main_cache); - size_t target_size = main_wanted_cache_size / 100 * 10; // 10% -// static bool query_current_size = true; -// if(query_current_size) { -// size_t main_current_cache_size = pgc_get_current_cache_size(main_cache); -// -// size_t main_free_cache_size = (main_wanted_cache_size > main_current_cache_size) ? -// main_wanted_cache_size - main_current_cache_size : 0; -// -// if(main_free_cache_size > target_size) -// target_size = main_free_cache_size; -// else -// query_current_size = false; -// } - if(target_size < 5 * 1024 * 1024) target_size = 5 * 1024 * 1024; - return target_size; + size_t main_current_cache_size = pgc_get_current_cache_size(main_cache); + + size_t main_free_cache_size = (main_wanted_cache_size > main_current_cache_size) ? 
+ main_wanted_cache_size - main_current_cache_size : 0; + + return target_size + main_free_cache_size; } size_t pgc_main_nominal_page_size(void *data) { diff --git a/src/database/engine/pdc.c b/src/database/engine/pdc.c index 1354ffd421400c..c5d46c66477290 100644 --- a/src/database/engine/pdc.c +++ b/src/database/engine/pdc.c @@ -58,7 +58,7 @@ void pdc_init(void) { NULL, NULL, false, false ); - telemetry_aral_register(pdc_globals.pdc.ar, "pdc"); + pulse_aral_register(pdc_globals.pdc.ar, "pdc"); } PDC *pdc_get(void) { @@ -87,7 +87,7 @@ void page_details_init(void) { NULL, NULL, NULL, false, false ); - telemetry_aral_register(pdc_globals.pd.ar, "pd"); + pulse_aral_register(pdc_globals.pd.ar, "pd"); } struct page_details *page_details_get(void) { @@ -116,7 +116,7 @@ void epdl_init(void) { NULL, NULL, NULL, false, false ); - telemetry_aral_register(pdc_globals.epdl.ar, "epdl"); + pulse_aral_register(pdc_globals.epdl.ar, "epdl"); } static EPDL *epdl_get(void) { @@ -146,7 +146,7 @@ void deol_init(void) { NULL, NULL, false, false ); - telemetry_aral_register(pdc_globals.deol.ar, "deol"); + pulse_aral_register(pdc_globals.deol.ar, "deol"); } static DEOL *deol_get(void) { @@ -182,7 +182,7 @@ static struct { } extent_buffer_globals = { .protected = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .available_items = NULL, .available = 0, }, diff --git a/src/database/engine/rrdengine.c b/src/database/engine/rrdengine.c index 80e9b898512bd8..34f59dcb11bf7e 100644 --- a/src/database/engine/rrdengine.c +++ b/src/database/engine/rrdengine.c @@ -95,7 +95,7 @@ struct rrdeng_main { .cmd_queue = { .unsafe = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, }, } }; @@ -149,7 +149,7 @@ static void work_request_init(void) { NULL, NULL, false, false ); - telemetry_aral_register(rrdeng_main.work_cmd.ar, "workers"); + pulse_aral_register(rrdeng_main.work_cmd.ar, "workers"); } enum LIBUV_WORKERS_STATUS { @@ -269,7 +269,7 @@ 
void page_descriptors_init(void) { NULL, NULL, NULL, false, false); - telemetry_aral_register(rrdeng_main.xt_io_descr.ar, "descriptors"); + pulse_aral_register(rrdeng_main.xt_io_descr.ar, "descriptors"); } struct page_descr_with_data *page_descriptor_get(void) { @@ -295,7 +295,7 @@ static void extent_io_descriptor_init(void) { NULL, NULL, false, false ); - telemetry_aral_register(rrdeng_main.xt_io_descr.ar, "extent io"); + pulse_aral_register(rrdeng_main.xt_io_descr.ar, "extent io"); } static struct extent_io_descriptor *extent_io_descriptor_get(void) { @@ -320,7 +320,7 @@ void rrdeng_query_handle_init(void) { NULL, NULL, NULL, false, false); - telemetry_aral_register(rrdeng_main.handles.ar, "query handles"); + pulse_aral_register(rrdeng_main.handles.ar, "query handles"); } struct rrdeng_query_handle *rrdeng_query_handle_get(void) { @@ -348,7 +348,7 @@ static struct { } atomics; } wal_globals = { .protected = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .available_items = NULL, .available = 0, }, @@ -442,7 +442,7 @@ static void rrdeng_cmd_queue_init(void) { NULL, NULL, NULL, false, false); - telemetry_aral_register(rrdeng_main.cmd_queue.ar, "opcodes"); + pulse_aral_register(rrdeng_main.cmd_queue.ar, "opcodes"); } static inline STORAGE_PRIORITY rrdeng_enq_cmd_map_opcode_to_priority(enum rrdeng_opcode opcode, STORAGE_PRIORITY priority) { @@ -1745,7 +1745,7 @@ static void dbengine_initialize_structures(void) { bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx __maybe_unused) { static bool spawned = false; - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; spinlock_lock(&spinlock); diff --git a/src/database/engine/rrdengineapi.c b/src/database/engine/rrdengineapi.c index cdcf17326b9b64..69976934f6791a 100755 --- a/src/database/engine/rrdengineapi.c +++ b/src/database/engine/rrdengineapi.c @@ -692,7 +692,7 @@ void 
rrdeng_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *sch // query ops #ifdef NETDATA_INTERNAL_CHECKS -SPINLOCK global_query_handle_spinlock = NETDATA_SPINLOCK_INITIALIZER; +SPINLOCK global_query_handle_spinlock = SPINLOCK_INITIALIZER; static struct rrdeng_query_handle *global_query_handle_ll = NULL; static void register_query_handle(struct rrdeng_query_handle *handle) { handle->query_pid = gettid_cached(); diff --git a/src/database/rrdlabels.c b/src/database/rrdlabels.c index 585b9826464a32..f16513520ae1a2 100644 --- a/src/database/rrdlabels.c +++ b/src/database/rrdlabels.c @@ -10,8 +10,7 @@ struct { SPINLOCK spinlock; } global_labels = { .JudyHS = (Pvoid_t) NULL, - .spinlock = NETDATA_SPINLOCK_INITIALIZER -}; + .spinlock = SPINLOCK_INITIALIZER}; typedef struct label_registry_idx { STRING *key; diff --git a/src/database/rrdset.c b/src/database/rrdset.c index ea5e50b00b2dd0..448e836650c10a 100644 --- a/src/database/rrdset.c +++ b/src/database/rrdset.c @@ -1293,7 +1293,7 @@ void rrddim_store_metric(RRDDIM *rd, usec_t point_end_time_ut, NETDATA_DOUBLE n, } void store_metric_collection_completed() { - telemetry_queries_rrdset_collection_completed(rrdset_done_statistics_points_stored_per_tier); + pulse_queries_rrdset_collection_completed(rrdset_done_statistics_points_stored_per_tier); } // caching of dimensions rrdset_done() and rrdset_done_interpolate() loop through diff --git a/src/database/sqlite/sqlite_functions.c b/src/database/sqlite/sqlite_functions.c index d3ca06dc203559..b55113a2d450eb 100644 --- a/src/database/sqlite/sqlite_functions.c +++ b/src/database/sqlite/sqlite_functions.c @@ -13,7 +13,7 @@ SQLITE_API int sqlite3_exec_monitored( char **errmsg /* Error msg written here */ ) { int rc = sqlite3_exec(db, sql, callback, data, errmsg); - telemetry_sqlite3_query_completed(rc == SQLITE_OK, rc == SQLITE_BUSY, rc == SQLITE_LOCKED); + pulse_sqlite3_query_completed(rc == SQLITE_OK, rc == SQLITE_BUSY, rc == SQLITE_LOCKED); return rc; } @@ -25,14 
+25,14 @@ SQLITE_API int sqlite3_step_monitored(sqlite3_stmt *stmt) { rc = sqlite3_step(stmt); switch (rc) { case SQLITE_DONE: - telemetry_sqlite3_query_completed(1, 0, 0); + pulse_sqlite3_query_completed(1, 0, 0); break; case SQLITE_ROW: - telemetry_sqlite3_row_completed(); + pulse_sqlite3_row_completed(); break; case SQLITE_BUSY: case SQLITE_LOCKED: - telemetry_sqlite3_query_completed(false, rc == SQLITE_BUSY, rc == SQLITE_LOCKED); + pulse_sqlite3_query_completed(false, rc == SQLITE_BUSY, rc == SQLITE_LOCKED); usleep(SQLITE_INSERT_DELAY * USEC_PER_MS); continue; default: @@ -402,7 +402,7 @@ int sqlite_library_init(void) return (SQLITE_OK != rc); } -SPINLOCK sqlite_spinlock = NETDATA_SPINLOCK_INITIALIZER; +SPINLOCK sqlite_spinlock = SPINLOCK_INITIALIZER; void sqlite_library_shutdown(void) { diff --git a/src/exporting/process_data.c b/src/exporting/process_data.c index aa60eab5faed39..c1e78f65e6bf16 100644 --- a/src/exporting/process_data.c +++ b/src/exporting/process_data.c @@ -139,7 +139,7 @@ NETDATA_DOUBLE exporting_calculate_value_from_stored_data( counter += sp.count; } storage_engine_query_finalize(&handle); - telemetry_queries_exporters_query_completed(points_read); + pulse_queries_exporters_query_completed(points_read); if (unlikely(!counter)) { netdata_log_debug( diff --git a/src/exporting/send_internal_metrics.c b/src/exporting/send_internal_metrics.c index f447e6b5185e00..bacd47361ed01e 100644 --- a/src/exporting/send_internal_metrics.c +++ b/src/exporting/send_internal_metrics.c @@ -11,7 +11,7 @@ */ void create_main_rusage_chart(RRDSET **st_rusage, RRDDIM **rd_user, RRDDIM **rd_system) { - if (!telemetry_enabled) + if (!pulse_enabled) return; if (*st_rusage && *rd_user && *rd_system) @@ -44,7 +44,7 @@ void create_main_rusage_chart(RRDSET **st_rusage, RRDDIM **rd_user, RRDDIM **rd_ */ void send_main_rusage(RRDSET *st_rusage, RRDDIM *rd_user, RRDDIM *rd_system) { - if (!telemetry_enabled) + if (!pulse_enabled) return; struct rusage thread; @@ -65,7 +65,7 
@@ void send_main_rusage(RRDSET *st_rusage, RRDDIM *rd_user, RRDDIM *rd_system) */ void send_internal_metrics(struct instance *instance) { - if (!telemetry_enabled) + if (!pulse_enabled) return; struct stats *stats = &instance->stats; diff --git a/src/health/health.c b/src/health/health.c index 78559d7f47fbb8..ab3b8af190a19a 100644 --- a/src/health/health.c +++ b/src/health/health.c @@ -5,7 +5,7 @@ struct health_plugin_globals health_globals = { .initialization = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .done = false, }, .config = { diff --git a/src/health/rrdcalc.c b/src/health/rrdcalc.c index e5a26db073fe43..1a7647eca8d6e3 100644 --- a/src/health/rrdcalc.c +++ b/src/health/rrdcalc.c @@ -329,15 +329,18 @@ static void rrdcalc_rrdhost_insert_callback(const DICTIONARY_ITEM *item __maybe_ if(!rc->config.units) rc->config.units = string_dup(st->units); - if(rc->config.update_every < rc->rrdset->update_every) { - netdata_log_info( - "HEALTH: alert '%s.%s' has update every %d, less than chart update every %d. " - "Setting alarm update frequency to %d.", - string2str(st->id), string2str(rc->config.name), - rc->config.update_every, rc->rrdset->update_every, rc->rrdset->update_every); - - rc->config.update_every = st->update_every; - } + // the following interferes with replication, changing the alert frequency to unexpected values + // let's respect user configuration, so we disable it + +// if(rc->config.update_every < rc->rrdset->update_every) { +// netdata_log_info( +// "HEALTH: alert '%s.%s' has update every %d, less than chart update every %d. 
" +// "Setting alarm update frequency to %d.", +// string2str(st->id), string2str(rc->config.name), +// rc->config.update_every, rc->rrdset->update_every, rc->rrdset->update_every); +// +// rc->config.update_every = st->update_every; +// } rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->config.name, &rc->next_event_id, &rc->config.hash_id); diff --git a/src/libnetdata/aral/aral.c b/src/libnetdata/aral/aral.c index 3331cd4ede3909..3d7c76fb1ac8c1 100644 --- a/src/libnetdata/aral/aral.c +++ b/src/libnetdata/aral/aral.c @@ -11,18 +11,17 @@ #define TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS #endif -// max file size +// max mapped file size #define ARAL_MAX_PAGE_SIZE_MMAP (1ULL * 1024 * 1024 * 1024) // max malloc size // optimal at current versions of libc is up to 256k // ideal to have the same overhead as libc is 4k -#define ARAL_MAX_PAGE_SIZE_MALLOC (256ULL * 1024) +#define ARAL_MAX_PAGE_SIZE_MALLOC (1ULL * 1024 * 1024) -// we don't need alignof(max_align_t) for normal C structures -// alignof(uintptr_r) is sufficient for our use cases -// #define SYSTEM_REQUIRED_ALIGNMENT (alignof(max_align_t)) -#define SYSTEM_REQUIRED_ALIGNMENT (alignof(uintptr_t)) +// in malloc mode, when the page is bigger than this +// use anonymous private mmap pages +#define ARAL_MMAP_PAGES_ABOVE (32ULL * 1024) typedef struct aral_free { size_t size; @@ -31,6 +30,8 @@ typedef struct aral_free { typedef struct aral_page { bool marked; + bool started_marked; + bool mapped; uint32_t size; // the allocation size of the page const char *filename; uint8_t *data; @@ -62,6 +63,7 @@ struct aral_ops { struct { alignas(64) size_t allocators; // the number of threads currently trying to allocate memory alignas(64) size_t deallocators; // the number of threads currently trying to deallocate memory + alignas(64) bool last_allocated_or_deallocated; // stability detector, true when was last allocated } atomic; struct { @@ -80,8 +82,7 @@ struct aral { size_t element_size; // calculated to take into account ARAL 
overheads size_t max_allocation_size; // calculated in bytes - size_t max_page_elements; // calculated - size_t page_ptr_offset; // calculated + size_t element_ptr_offset; // calculated size_t system_page_size; // calculated size_t initial_page_elements; @@ -161,30 +162,19 @@ struct aral_statistics *aral_get_statistics(ARAL *ar) { return ar->stats; } -static inline size_t memory_alignment(size_t size, size_t alignment) { - // return (size + alignment - 1) & ~(alignment - 1); // assumees alignment is power of 2 - return ((size + alignment - 1) / alignment) * alignment; -} - -static size_t aral_align_alloc_size(ARAL *ar, uint64_t size) { - size = memory_alignment(size, ar->config.system_page_size); - - if(size % ar->config.element_size) - size -= size % ar->config.element_size; - - return size; -} - -static inline void aral_lock(ARAL *ar) { +static inline void aral_lock_with_trace(ARAL *ar, const char *func) { if(likely(!(ar->config.options & ARAL_LOCKLESS))) - spinlock_lock(&ar->aral_lock.spinlock); + spinlock_lock_with_trace(&ar->aral_lock.spinlock, func); } -static inline void aral_unlock(ARAL *ar) { +static inline void aral_unlock_with_trace(ARAL *ar, const char *func) { if(likely(!(ar->config.options & ARAL_LOCKLESS))) - spinlock_unlock(&ar->aral_lock.spinlock); + spinlock_unlock_with_trace(&ar->aral_lock.spinlock, func); } +#define aral_lock(ar) aral_lock_with_trace(ar, __FUNCTION__) +#define aral_unlock(ar) aral_unlock_with_trace(ar, __FUNCTION__) + static inline void aral_page_free_lock(ARAL *ar, ARAL_PAGE *page) { if(likely(!(ar->config.options & ARAL_LOCKLESS))) spinlock_lock(&page->free.spinlock); @@ -318,13 +308,13 @@ static inline ARAL_PAGE *find_page_with_allocation_internal_check(ARAL *ar, void } #endif -// ---------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------------- // Tagging the pointer with the 'marked' flag // 
Retrieving the pointer and the 'marked' flag static ARAL_PAGE *aral_get_page_pointer_after_element___do_NOT_have_aral_lock(ARAL *ar, void *ptr, bool *marked) { uint8_t *data = ptr; - uintptr_t *page_ptr = (uintptr_t *)&data[ar->config.page_ptr_offset]; + uintptr_t *page_ptr = (uintptr_t *)&data[ar->config.element_ptr_offset]; uintptr_t tagged_page = __atomic_load_n(page_ptr, __ATOMIC_ACQUIRE); // Atomically load the tagged pointer *marked = (tagged_page & 1) != 0; // Extract the LSB as the 'marked' flag ARAL_PAGE *page = (ARAL_PAGE *)(tagged_page & ~1); // Mask out the LSB to get the original pointer @@ -358,13 +348,13 @@ static ARAL_PAGE *aral_get_page_pointer_after_element___do_NOT_have_aral_lock(AR static void aral_set_page_pointer_after_element___do_NOT_have_aral_lock(ARAL *ar, void *page, void *ptr, bool marked) { uint8_t *data = ptr; - uintptr_t *page_ptr = (uintptr_t *)&data[ar->config.page_ptr_offset]; + uintptr_t *page_ptr = (uintptr_t *)&data[ar->config.element_ptr_offset]; uintptr_t tagged_page = (uintptr_t)page; // Cast the pointer to an integer if (marked) tagged_page |= 1; // Set the LSB to 1 if 'marked' is true __atomic_store_n(page_ptr, tagged_page, __ATOMIC_RELEASE); // Atomically store the tagged pointer } -// ---------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------------- // check a free slot #ifdef NETDATA_INTERNAL_CHECKS @@ -381,57 +371,168 @@ static inline void aral_free_validate_internal_check(ARAL *ar, ARAL_FREE *fr) { #define aral_free_validate_internal_check(ar, fr) debug_dummy() #endif -// ---------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------------- +// page size management + +static inline size_t memory_alignment(size_t size, size_t alignment) { + // return (size + 
alignment - 1) & ~(alignment - 1); // assumes alignment is power of 2 + return ((size + alignment - 1) / alignment) * alignment; +} + +static size_t aral_get_system_page_size(void) { + long int page_size = sysconf(_SC_PAGE_SIZE); + if (unlikely(page_size <= 4096)) + return 4096; + else + return page_size; +} + +// we don't need alignof(max_align_t) for normal C structures +// alignof(uintptr_t) is sufficient for our use cases +// #define SYSTEM_REQUIRED_ALIGNMENT (alignof(max_align_t)) +#define SYSTEM_REQUIRED_ALIGNMENT (alignof(uintptr_t)) + +static size_t aral_element_slot_size(size_t requested_element_size, bool usable) { + // we need to add a page pointer after the element + // so, first align the element size to the pointer size + size_t element_size = memory_alignment(requested_element_size, sizeof(uintptr_t)); + + // then add the size of a pointer to it + element_size += sizeof(uintptr_t); + + // make sure it is at least what we need for an ARAL_FREE slot + if (element_size < sizeof(ARAL_FREE)) + element_size = sizeof(ARAL_FREE); + + // and finally align it to the natural alignment + element_size = memory_alignment(element_size, SYSTEM_REQUIRED_ALIGNMENT); + + if(usable) + return element_size - sizeof(uintptr_t); + + return element_size; +} + +size_t aral_optimal_malloc_page_size(void) { + return ARAL_MAX_PAGE_SIZE_MALLOC; +} -size_t aral_next_allocation_size___adders_lock_needed(ARAL *ar, bool marked) { +static size_t aral_elements_in_page_size(ARAL *ar, size_t page_size) { + if(ar->config.mmap.enabled) + return page_size / ar->config.element_size; + + size_t aral_page_size = memory_alignment(sizeof(ARAL_PAGE), SYSTEM_REQUIRED_ALIGNMENT); + size_t remaining = page_size - aral_page_size; + return remaining / ar->config.element_size; +} + +static size_t aral_next_allocation_size___adders_lock_needed(ARAL *ar, bool marked) { size_t idx = mark_to_idx(marked); size_t size = ar->ops[idx].adders.allocation_size; - if(size > ar->config.max_allocation_size) - size 
= ar->config.max_allocation_size; - else - ar->ops[idx].adders.allocation_size = aral_align_alloc_size(ar, (uint64_t)ar->ops[idx].adders.allocation_size * 2); + bool last_allocated = __atomic_load_n(&ar->ops[idx].atomic.last_allocated_or_deallocated, __ATOMIC_RELAXED); + if(last_allocated) { + size *= 2; + if(size > ar->config.max_allocation_size) + size = ar->config.max_allocation_size; + ar->ops[idx].adders.allocation_size = size; + } + + if(!ar->config.mmap.enabled && size < ARAL_MMAP_PAGES_ABOVE) { + // when doing malloc, don't allocate entire pages, but only what needed + size = + aral_elements_in_page_size(ar, size) * ar->config.element_size + + memory_alignment(sizeof(ARAL_PAGE), SYSTEM_REQUIRED_ALIGNMENT); + } + + __atomic_store_n(&ar->ops[idx].atomic.last_allocated_or_deallocated, true, __ATOMIC_RELAXED); return size; } -static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { - ARAL_PAGE *page = callocz(1, sizeof(ARAL_PAGE)); - spinlock_init(&page->free.spinlock); - page->size = size; - page->max_elements = page->size / ar->config.element_size; - page->aral_lock.free_elements = page->max_elements; - - __atomic_add_fetch(&ar->stats->structures.allocations, 1, __ATOMIC_RELAXED); - __atomic_add_fetch(&ar->stats->structures.allocated_bytes, sizeof(ARAL_PAGE), __ATOMIC_RELAXED); +// -------------------------------------------------------------------------------------------------------------------- - if(unlikely(ar->config.mmap.enabled)) { +static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { + size_t data_size, structures_size; + ARAL_PAGE *page; + if(ar->config.mmap.enabled) { + page = callocz(1, sizeof(ARAL_PAGE)); ar->aral_lock.file_number++; + char filename[FILENAME_MAX + 1]; snprintfz(filename, FILENAME_MAX, "%s/array_alloc.mmap/%s.%zu", *ar->config.mmap.cache_dir, ar->config.mmap.filename, ar->aral_lock.file_number); 
page->filename = strdupz(filename); - page->data = netdata_mmap(page->filename, page->size, MAP_SHARED, 0, false, NULL); + page->mapped = true; + + page->data = netdata_mmap(page->filename, size, MAP_SHARED, 0, false, NULL); if (unlikely(!page->data)) - fatal("ARAL: '%s' cannot allocate aral buffer of size %u on filename '%s'", - ar->config.name, page->size, page->filename); + fatal("ARAL: '%s' cannot allocate aral buffer of size %zu on filename '%s'", + ar->config.name, size, page->filename); + __atomic_add_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED); - __atomic_add_fetch(&ar->stats->mmap.allocated_bytes, page->size, __ATOMIC_RELAXED); + __atomic_add_fetch(&ar->stats->mmap.allocated_bytes, size, __ATOMIC_RELAXED); + data_size = size; + structures_size = sizeof(ARAL_PAGE); } - else { #ifdef NETDATA_TRACE_ALLOCATIONS - page->data = mallocz_int(page->size TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); + else { + page = callocz(1, sizeof(ARAL_PAGE)); + page->data = mallocz_int(size TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); + page->mapped = false; + __atomic_add_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ar->stats->malloc.allocated_bytes, size, __ATOMIC_RELAXED); + } #else - page->data = mallocz(page->size); + else { + size_t ARAL_PAGE_size = memory_alignment(sizeof(ARAL_PAGE), SYSTEM_REQUIRED_ALIGNMENT); + size_t max_elements = aral_elements_in_page_size(ar, size); + data_size = max_elements * ar->config.element_size; + structures_size = size - data_size; + + if (size >= ARAL_MMAP_PAGES_ABOVE) { + bool mapped; + uint8_t *ptr = netdata_mmap(NULL, size, MAP_PRIVATE, 1, false, NULL); + if (ptr) { + mapped = true; + __atomic_add_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ar->stats->mmap.allocated_bytes, data_size, __ATOMIC_RELAXED); + } + else { + ptr = mallocz(size); + mapped = false; + __atomic_add_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED); + 
__atomic_add_fetch(&ar->stats->malloc.allocated_bytes, data_size, __ATOMIC_RELAXED); + } + page = (ARAL_PAGE *)ptr; + memset(page, 0, ARAL_PAGE_size); + page->data = &ptr[ARAL_PAGE_size]; + page->mapped = mapped; + } + else { + uint8_t *ptr = mallocz(size); + page = (ARAL_PAGE *)ptr; + memset(page, 0, ARAL_PAGE_size); + page->data = &ptr[ARAL_PAGE_size]; + page->mapped = false; + + __atomic_add_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ar->stats->malloc.allocated_bytes, data_size, __ATOMIC_RELAXED); + } + } #endif - __atomic_add_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED); - __atomic_add_fetch(&ar->stats->malloc.allocated_bytes, page->size, __ATOMIC_RELAXED); - } + spinlock_init(&page->free.spinlock); + page->size = size; + page->max_elements = aral_elements_in_page_size(ar, page->size); + page->aral_lock.free_elements = page->max_elements; + + __atomic_add_fetch(&ar->stats->structures.allocations, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ar->stats->structures.allocated_bytes, structures_size, __ATOMIC_RELAXED); // link the free space to its page ARAL_FREE *fr = (ARAL_FREE *)page->data; - fr->size = page->size; + fr->size = data_size; fr->next = NULL; page->free.list = fr; @@ -441,33 +542,55 @@ static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_ } void aral_del_page___no_lock_needed(ARAL *ar, ARAL_PAGE *page TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { + size_t idx = mark_to_idx(page->started_marked); + __atomic_store_n(&ar->ops[idx].atomic.last_allocated_or_deallocated, true, __ATOMIC_RELAXED); + + size_t data_size, structures_size; // free it if (ar->config.mmap.enabled) { + data_size = page->size; + structures_size = sizeof(ARAL_PAGE); + + __atomic_sub_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&ar->stats->mmap.allocated_bytes, page->size, __ATOMIC_RELAXED); + netdata_munmap(page->data, page->size); if (unlikely(unlink(page->filename) == 
1)) netdata_log_error("Cannot delete file '%s'", page->filename); freez((void *)page->filename); - - __atomic_sub_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED); - __atomic_sub_fetch(&ar->stats->mmap.allocated_bytes, page->size, __ATOMIC_RELAXED); + freez(page); } else { #ifdef NETDATA_TRACE_ALLOCATIONS + __atomic_sub_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&ar->stats->malloc.allocated_bytes, page->size - sizeof(ARAL_PAGE), __ATOMIC_RELAXED); + freez_int(page->data TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); + freez(page); #else - freez(page->data); + data_size = page->max_elements * ar->config.element_size; + structures_size = page->size - data_size; + + if(page->mapped) { + __atomic_sub_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&ar->stats->mmap.allocated_bytes, data_size, __ATOMIC_RELAXED); + + netdata_munmap(page, page->size); + } + else { + __atomic_sub_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&ar->stats->malloc.allocated_bytes, data_size, __ATOMIC_RELAXED); + + freez(page); + } #endif - __atomic_sub_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED); - __atomic_sub_fetch(&ar->stats->malloc.allocated_bytes, page->size, __ATOMIC_RELAXED); } - freez(page); - __atomic_sub_fetch(&ar->stats->structures.allocations, 1, __ATOMIC_RELAXED); - __atomic_sub_fetch(&ar->stats->structures.allocated_bytes, sizeof(ARAL_PAGE), __ATOMIC_RELAXED); + __atomic_sub_fetch(&ar->stats->structures.allocated_bytes, structures_size, __ATOMIC_RELAXED); } static inline ARAL_PAGE *aral_get_first_page_with_a_free_slot(ARAL *ar, bool marked TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { @@ -502,7 +625,7 @@ static inline ARAL_PAGE *aral_get_first_page_with_a_free_slot(ARAL *ar, bool mar if(ar->ops[idx].adders.allocating_elements + threads_currently_deallocating < threads_currently_allocating) { can_add = true; page_allocation_size = 
aral_next_allocation_size___adders_lock_needed(ar, marked); - ar->ops[idx].adders.allocating_elements += page_allocation_size / ar->config.element_size; + ar->ops[idx].adders.allocating_elements += aral_elements_in_page_size(ar, page_allocation_size); } aral_adders_unlock(ar, marked); } @@ -510,7 +633,7 @@ static inline ARAL_PAGE *aral_get_first_page_with_a_free_slot(ARAL *ar, bool mar if(can_add) { page = aral_create_page___no_lock_needed(ar, page_allocation_size TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); - page->marked = marked; + page->marked = page->started_marked = marked; aral_lock(ar); @@ -521,7 +644,7 @@ static inline ARAL_PAGE *aral_get_first_page_with_a_free_slot(ARAL *ar, bool mar //#endif aral_adders_lock(ar, marked); - ar->ops[idx].adders.allocating_elements -= page_allocation_size / ar->config.element_size; + ar->ops[idx].adders.allocating_elements -= aral_elements_in_page_size(ar, page_allocation_size); aral_adders_unlock(ar, marked); // we have a page that is all empty @@ -836,38 +959,6 @@ size_t aral_actual_element_size(ARAL *ar) { return ar->config.element_size; } -static size_t aral_allocation_slot_size(size_t requested_element_size, bool usable) { - // we need to add a page pointer after the element - // so, first align the element size to the pointer size - size_t element_size = memory_alignment(requested_element_size, sizeof(uintptr_t)); - - // then add the size of a pointer to it - element_size += sizeof(uintptr_t); - - // make sure it is at least what we need for an ARAL_FREE slot - if (element_size < sizeof(ARAL_FREE)) - element_size = sizeof(ARAL_FREE); - - // and finally align it to the natural alignment - element_size = memory_alignment(element_size, SYSTEM_REQUIRED_ALIGNMENT); - - if(usable) - return element_size - sizeof(uintptr_t); - - return element_size; -} - -size_t aral_optimal_page_size(void) { - return ARAL_MAX_PAGE_SIZE_MALLOC; -} - -static void optimal_max_page_size(ARAL *ar) { - if(ar->config.requested_max_page_size) - return; 
- - ar->config.requested_max_page_size = aral_optimal_page_size(); -} - ARAL *aral_create(const char *name, size_t element_size, size_t initial_page_elements, size_t max_page_size, struct aral_statistics *stats, const char *filename, const char **cache_dir, bool mmap, bool lockless) { ARAL *ar = callocz(1, sizeof(ARAL)); @@ -892,19 +983,22 @@ ARAL *aral_create(const char *name, size_t element_size, size_t initial_page_ele ar->config.options |= ARAL_ALLOCATED_STATS; } - long int page_size = sysconf(_SC_PAGE_SIZE); - if (unlikely(page_size == -1)) - ar->config.system_page_size = 4096; - else - ar->config.system_page_size = page_size; + // ---------------------------------------------------------------------------------------------------------------- + // disable mmap if the directories are not given + + if(ar->config.mmap.enabled && (!ar->config.mmap.cache_dir || !*ar->config.mmap.cache_dir)) { + netdata_log_error("ARAL: '%s' mmap cache directory is not configured properly, disabling mmap.", ar->config.name); + ar->config.mmap.enabled = false; + internal_fatal(true, "ARAL: '%s' mmap cache directory is not configured properly", ar->config.name); + } - ar->config.element_size = aral_allocation_slot_size(ar->config.requested_element_size, false); - optimal_max_page_size(ar); + // ---------------------------------------------------------------------------------------------------------------- + // calculate element size, after adding our pointer - ar->config.max_page_elements = ar->config.requested_max_page_size / ar->config.element_size; + ar->config.element_size = aral_element_slot_size(ar->config.requested_element_size, false); // we write the page pointer just after each element - ar->config.page_ptr_offset = ar->config.element_size - sizeof(uintptr_t); + ar->config.element_ptr_offset = ar->config.element_size - sizeof(uintptr_t); if(ar->config.requested_element_size + sizeof(uintptr_t) > ar->config.element_size) fatal("ARAL: '%s' failed to calculate properly 
page_ptr_offset: " @@ -912,35 +1006,42 @@ ARAL *aral_create(const char *name, size_t element_size, size_t initial_page_ele "final element size %zu, page_ptr_offset %zu", ar->config.name, ar->config.requested_element_size, sizeof(uintptr_t), SYSTEM_REQUIRED_ALIGNMENT, - ar->config.element_size, ar->config.page_ptr_offset); + ar->config.element_size, ar->config.element_ptr_offset); - //netdata_log_info("ARAL: element size %zu, sizeof(uintptr_t) %zu, natural alignment %zu, final element size %zu, page_ptr_offset %zu", - // ar->element_size, sizeof(uintptr_t), ARAL_NATURAL_ALIGNMENT, ar->internal.element_size, ar->internal.page_ptr_offset); + // ---------------------------------------------------------------------------------------------------------------- + // calculate allocation sizes + ar->config.system_page_size = aral_get_system_page_size(); if (ar->config.initial_page_elements < 2) ar->config.initial_page_elements = 2; - if(ar->config.mmap.enabled && (!ar->config.mmap.cache_dir || !*ar->config.mmap.cache_dir)) { - netdata_log_error("ARAL: '%s' mmap cache directory is not configured properly, disabling mmap.", ar->config.name); - ar->config.mmap.enabled = false; - internal_fatal(true, "ARAL: '%s' mmap cache directory is not configured properly", ar->config.name); - } + if(!ar->config.requested_max_page_size) + ar->config.requested_max_page_size = ar->config.mmap.enabled ? ARAL_MAX_PAGE_SIZE_MMAP : ARAL_MAX_PAGE_SIZE_MALLOC; - uint64_t max_alloc_size; - if(!ar->config.max_page_elements) - max_alloc_size = ar->config.mmap.enabled ? 
ARAL_MAX_PAGE_SIZE_MMAP : ARAL_MAX_PAGE_SIZE_MALLOC; - else - max_alloc_size = ar->config.max_page_elements * ar->config.element_size; + // calculate the maximum allocation size we will do + ar->config.max_allocation_size = + memory_alignment(ar->config.requested_max_page_size, ar->config.system_page_size); + + // find the minimum page size we will use + size_t min_required_page_size = memory_alignment(sizeof(ARAL_PAGE), SYSTEM_REQUIRED_ALIGNMENT) + 2 * ar->config.element_size; + min_required_page_size = memory_alignment(min_required_page_size, ar->config.system_page_size); + + // make sure the maximum is enough + if(ar->config.max_allocation_size < min_required_page_size) + ar->config.max_allocation_size = min_required_page_size; + + // set the starting allocation size for both marked and unmarked partitions + ar->ops[0].adders.allocation_size = ar->ops[1].adders.allocation_size = min_required_page_size; + + // ---------------------------------------------------------------------------------------------------------------- - ar->config.max_allocation_size = aral_align_alloc_size(ar, max_alloc_size); - ar->ops[0].adders.allocation_size = - ar->ops[1].adders.allocation_size = - aral_align_alloc_size(ar, (uint64_t)ar->config.element_size * ar->config.initial_page_elements); ar->aral_lock.pages_free = NULL; ar->aral_lock.pages_marked_free = NULL; ar->aral_lock.file_number = 0; + // ---------------------------------------------------------------------------------------------------------------- + if(ar->config.mmap.enabled) { char directory_name[FILENAME_MAX + 1]; snprintfz(directory_name, FILENAME_MAX, "%s/array_alloc.mmap", *ar->config.mmap.cache_dir); @@ -972,7 +1073,7 @@ ARAL *aral_create(const char *name, size_t element_size, size_t initial_page_ele return ar; } -// ---------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------------- // 
global aral caching #define ARAL_BY_SIZE_MAX_SIZE 1024 @@ -1063,7 +1164,7 @@ void aral_by_size_release(ARAL *ar) { aral_destroy(ar); } -// ---------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------------- // unittest struct aral_unittest_config { @@ -1159,10 +1260,11 @@ static void *aral_test_thread(void *ptr) { pointers[i] = unittest_aral_malloc(ar, marked); } - size_t increment = elements / ar->config.max_page_elements; + size_t max_page_elements = aral_elements_in_page_size(ar, ar->config.max_allocation_size); + size_t increment = elements / max_page_elements; for (size_t all = increment; all <= elements / 2; all += increment) { - size_t to_free = (all % ar->config.max_page_elements) + 1; + size_t to_free = (all % max_page_elements) + 1; size_t step = elements / to_free; if(!step) step = 1; diff --git a/src/libnetdata/aral/aral.h b/src/libnetdata/aral/aral.h index 474c160519acc6..86de671fd6b5b1 100644 --- a/src/libnetdata/aral/aral.h +++ b/src/libnetdata/aral/aral.h @@ -52,7 +52,7 @@ struct aral_statistics *aral_by_size_statistics(void); size_t aral_by_size_used_bytes(void); size_t aral_used_bytes_from_stats(struct aral_statistics *stats); -size_t aral_optimal_page_size(void); +size_t aral_optimal_malloc_page_size(void); int aral_unittest(size_t elements); diff --git a/src/libnetdata/avl/avl.h b/src/libnetdata/avl/avl.h index 595d6ec6c07c48..c7cc3aa29046f1 100644 --- a/src/libnetdata/avl/avl.h +++ b/src/libnetdata/avl/avl.h @@ -13,7 +13,7 @@ #if defined(AVL_LOCK_WITH_RWLOCK) #define AVL_LOCK_INITIALIZER NETDATA_RWLOCK_INITIALIZER #else -#define AVL_LOCK_INITIALIZER NETDATA_RW_SPINLOCK_INITIALIZER +#define AVL_LOCK_INITIALIZER RW_SPINLOCK_INITIALIZER #endif /* Data structures */ diff --git a/src/libnetdata/clocks/clocks.c b/src/libnetdata/clocks/clocks.c index c658861984ad46..efa39506def650 100644 --- a/src/libnetdata/clocks/clocks.c 
+++ b/src/libnetdata/clocks/clocks.c @@ -264,7 +264,7 @@ void sleep_to_absolute_time(usec_t usec) { #define HEARTBEAT_RANDOM_OFFSET_UT (350 * USEC_PER_MS) #define HEARTBEAT_ALIGNMENT_STATISTICS_SIZE 20 -static SPINLOCK heartbeat_alignment_spinlock = NETDATA_SPINLOCK_INITIALIZER; +static SPINLOCK heartbeat_alignment_spinlock = SPINLOCK_INITIALIZER; static size_t heartbeat_alignment_id = 0; struct heartbeat_thread_statistics { diff --git a/src/libnetdata/config/appconfig.h b/src/libnetdata/config/appconfig.h index e4ae65e9436311..2fabe91a58cb71 100644 --- a/src/libnetdata/config/appconfig.h +++ b/src/libnetdata/config/appconfig.h @@ -100,7 +100,7 @@ #define CONFIG_SECTION_PROMETHEUS "prometheus:exporter" #define CONFIG_SECTION_HOST_LABEL "host labels" #define EXPORTING_CONF "exporting.conf" -#define CONFIG_SECTION_TELEMETRY "telemetry" +#define CONFIG_SECTION_PULSE "pulse" #define CONFIG_SECTION_DB "db" // these are used to limit the configuration names and values lengths @@ -122,7 +122,7 @@ struct config { #define APPCONFIG_INITIALIZER (struct config) { \ .sections = NULL, \ - .spinlock = NETDATA_SPINLOCK_INITIALIZER, \ + .spinlock = SPINLOCK_INITIALIZER, \ .index = { \ .avl_tree = { \ .root = NULL, \ diff --git a/src/libnetdata/config/appconfig_conf_file.c b/src/libnetdata/config/appconfig_conf_file.c index 1a1149853cf49d..1bdda0c6a0a285 100644 --- a/src/libnetdata/config/appconfig_conf_file.c +++ b/src/libnetdata/config/appconfig_conf_file.c @@ -227,7 +227,7 @@ void appconfig_generate(struct config *root, BUFFER *wb, int only_changed, bool else if(!string_strcmp(sect->name, CONFIG_SECTION_WEBRTC)) pri = 11; // by default, new sections will get pri = 12 (set at the end, below) else if(!string_strcmp(sect->name, CONFIG_SECTION_REGISTRY)) pri = 13; - else if(!string_strcmp(sect->name, CONFIG_SECTION_TELEMETRY)) pri = 14; + else if(!string_strcmp(sect->name, CONFIG_SECTION_PULSE)) pri = 14; else if(!string_strcmp(sect->name, CONFIG_SECTION_PLUGINS)) pri = 15; else 
if(!string_strcmp(sect->name, CONFIG_SECTION_STATSD)) pri = 16; else if(!string_strncmp(sect->name, "plugin:", 7)) pri = 17; // << change the loop too if you change this diff --git a/src/libnetdata/functions_evloop/functions_evloop.c b/src/libnetdata/functions_evloop/functions_evloop.c index fd0061844fcc34..4aa4a2a2f6a0a2 100644 --- a/src/libnetdata/functions_evloop/functions_evloop.c +++ b/src/libnetdata/functions_evloop/functions_evloop.c @@ -312,17 +312,24 @@ static void *rrd_functions_worker_globals_reader_main(void *arg) { else nd_log(NDLS_COLLECTORS, NDLP_NOTICE, "Received PROGRESS for transaction '%s', but it not available here", transaction); } + else if(keyword && strcmp(keyword, PLUGINSD_CALL_QUIT) == 0) { + *wg->plugin_should_exit = true; + break; + } else nd_log(NDLS_COLLECTORS, NDLP_NOTICE, "Received unknown command: %s", keyword ? keyword : "(unset)"); buffer_flush(buffer); } - if(!(*wg->plugin_should_exit)) + int status = 0; + if(!(*wg->plugin_should_exit)) { nd_log(NDLS_COLLECTORS, NDLP_ERR, "Read error on stdin"); + status = 1; + } *wg->plugin_should_exit = true; - exit(1); + exit(status); } void worker_queue_delete_cb(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) { diff --git a/src/libnetdata/functions_evloop/functions_evloop.h b/src/libnetdata/functions_evloop/functions_evloop.h index ca1e3655e8ebca..9abba0f4473ebe 100644 --- a/src/libnetdata/functions_evloop/functions_evloop.h +++ b/src/libnetdata/functions_evloop/functions_evloop.h @@ -63,6 +63,8 @@ #define PLUGINSD_CALL_FUNCTION_CANCEL "FUNCTION_CANCEL" // cancel a running function transaction #define PLUGINSD_CALL_FUNCTION_PROGRESS "FUNCTION_PROGRESS" // let the function know the user is waiting +#define PLUGINSD_CALL_QUIT "QUIT" // ask the plugin to quit + // dyncfg // enabled with STREAM_CAP_DYNCFG #define PLUGINSD_KEYWORD_CONFIG "CONFIG" diff --git a/src/libnetdata/july/july.c b/src/libnetdata/july/july.c index 56b8494b3dca9a..20770fe66d88d1 100644 --- 
a/src/libnetdata/july/july.c +++ b/src/libnetdata/july/july.c @@ -44,7 +44,7 @@ static struct { } atomics; } julyl_globals = { .protected = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .available_items = NULL, .available = 0, }, diff --git a/src/libnetdata/libnetdata.c b/src/libnetdata/libnetdata.c index 043bc2febd40aa..62ac5ea08322dc 100644 --- a/src/libnetdata/libnetdata.c +++ b/src/libnetdata/libnetdata.c @@ -450,11 +450,11 @@ void posix_memfree(void *ptr) { void mallocz_release_as_much_memory_to_the_system(void) { #if defined(HAVE_C_MALLOPT) || defined(HAVE_C_MALLOC_TRIM) - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; spinlock_lock(&spinlock); #ifdef HAVE_C_MALLOPT - size_t trim_threshold = aral_optimal_page_size(); + size_t trim_threshold = aral_optimal_malloc_page_size(); mallopt(M_TRIM_THRESHOLD, (int)trim_threshold); #endif @@ -799,7 +799,7 @@ BUFFER *run_command_and_get_output_to_buffer(const char *command, int max_line_l buffer[max_line_length] = '\0'; buffer_strcat(wb, buffer); } - spawn_popen_kill(pi); + spawn_popen_kill(pi, 0); } else { buffer_free(wb); @@ -818,7 +818,7 @@ bool run_command_and_copy_output_to_stdout(const char *command, int max_line_len while (fgets(buffer, max_line_length, spawn_popen_stdout(pi))) fprintf(stdout, "%s", buffer); - spawn_popen_kill(pi); + spawn_popen_kill(pi, 0); } else { netdata_log_error("Failed to execute command '%s'.", command); diff --git a/src/libnetdata/libnetdata.h b/src/libnetdata/libnetdata.h index 02ae57126a9a22..2dc9d6f5970258 100644 --- a/src/libnetdata/libnetdata.h +++ b/src/libnetdata/libnetdata.h @@ -123,6 +123,8 @@ extern const char *netdata_configured_host_prefix; #include "threads/threads.h" #include "locks/locks.h" +#include "locks/spinlock.h" +#include "locks/rw-spinlock.h" #include "completion/completion.h" #include "clocks/clocks.h" #include "simple_pattern/simple_pattern.h" diff --git 
a/src/libnetdata/local-sockets/local-sockets.h b/src/libnetdata/local-sockets/local-sockets.h index 95529735e01adc..be1d3e93142a19 100644 --- a/src/libnetdata/local-sockets/local-sockets.h +++ b/src/libnetdata/local-sockets/local-sockets.h @@ -1587,7 +1587,7 @@ static inline bool local_sockets_get_namespace_sockets_with_pid(LS_STATE *ls, st spinlock_unlock(&ls->spinlock); } - spawn_server_exec_kill(ls->spawn_server, si); + spawn_server_exec_kill(ls->spawn_server, si, 0); if(ls->config.report && received == 0) __atomic_add_fetch(&ls->stats.namespaces_forks_unresponsive, 1, __ATOMIC_RELAXED); diff --git a/src/libnetdata/locks/locks.c b/src/libnetdata/locks/locks.c index 9df1942c90fb7d..2545eb34ab9929 100644 --- a/src/libnetdata/locks/locks.c +++ b/src/libnetdata/locks/locks.c @@ -220,256 +220,6 @@ int __netdata_rwlock_trywrlock(netdata_rwlock_t *rwlock) { return ret; } -// ---------------------------------------------------------------------------- -// spinlock implementation -// https://www.youtube.com/watch?v=rmGJc9PXpuE&t=41s - -#ifdef SPINLOCK_IMPL_WITH_MUTEX -void spinlock_init(SPINLOCK *spinlock) -{ - netdata_mutex_init(&spinlock->inner); -} -#else -void spinlock_init(SPINLOCK *spinlock) -{ - memset(spinlock, 0, sizeof(SPINLOCK)); -} -#endif - -#ifndef SPINLOCK_IMPL_WITH_MUTEX -static inline void spinlock_lock_internal(SPINLOCK *spinlock) -{ - #ifdef NETDATA_INTERNAL_CHECKS - size_t spins = 0; - #endif - - for(int i = 1; - __atomic_load_n(&spinlock->locked, __ATOMIC_RELAXED) || - __atomic_test_and_set(&spinlock->locked, __ATOMIC_ACQUIRE) - ; i++ - ) { - - #ifdef NETDATA_INTERNAL_CHECKS - spins++; - #endif - - if(unlikely(i % 8 == 0)) { - if(i == 8 * 4) { - i = 0; - yield_the_processor(); - } - else - tinysleep(); - } - } - - // we have the lock - - #ifdef NETDATA_INTERNAL_CHECKS - spinlock->spins += spins; - spinlock->locker_pid = gettid_cached(); - #endif - - nd_thread_spinlock_locked(); -} -#endif // SPINLOCK_IMPL_WITH_MUTEX - -#ifndef 
SPINLOCK_IMPL_WITH_MUTEX -static inline void spinlock_unlock_internal(SPINLOCK *spinlock) -{ - #ifdef NETDATA_INTERNAL_CHECKS - spinlock->locker_pid = 0; - #endif - - __atomic_clear(&spinlock->locked, __ATOMIC_RELEASE); - - nd_thread_spinlock_unlocked(); -} -#endif // SPINLOCK_IMPL_WITH_MUTEX - -#ifndef SPINLOCK_IMPL_WITH_MUTEX -static inline bool spinlock_trylock_internal(SPINLOCK *spinlock) { - if(!__atomic_load_n(&spinlock->locked, __ATOMIC_RELAXED) && - !__atomic_test_and_set(&spinlock->locked, __ATOMIC_ACQUIRE)) { - // we got the lock - nd_thread_spinlock_locked(); - return true; - } - - return false; -} -#endif // SPINLOCK_IMPL_WITH_MUTEX - -#ifdef SPINLOCK_IMPL_WITH_MUTEX -void spinlock_lock(SPINLOCK *spinlock) -{ - netdata_mutex_lock(&spinlock->inner); -} -#else -void spinlock_lock(SPINLOCK *spinlock) -{ - spinlock_lock_internal(spinlock); -} -#endif - -#ifdef SPINLOCK_IMPL_WITH_MUTEX -void spinlock_unlock(SPINLOCK *spinlock) -{ - netdata_mutex_unlock(&spinlock->inner); -} -#else -void spinlock_unlock(SPINLOCK *spinlock) -{ - spinlock_unlock_internal(spinlock); -} -#endif - -#ifdef SPINLOCK_IMPL_WITH_MUTEX -bool spinlock_trylock(SPINLOCK *spinlock) -{ - return netdata_mutex_trylock(&spinlock->inner) == 0; -} -#else -bool spinlock_trylock(SPINLOCK *spinlock) -{ - return spinlock_trylock_internal(spinlock); -} -#endif - -#ifdef SPINLOCK_IMPL_WITH_MUTEX -void spinlock_lock_cancelable(SPINLOCK *spinlock) -{ - netdata_mutex_lock(&spinlock->inner); -} -#else -void spinlock_lock_cancelable(SPINLOCK *spinlock) -{ - spinlock_lock_internal(spinlock); -} -#endif - -#ifdef SPINLOCK_IMPL_WITH_MUTEX -void spinlock_unlock_cancelable(SPINLOCK *spinlock) -{ - netdata_mutex_unlock(&spinlock->inner); -} -#else -void spinlock_unlock_cancelable(SPINLOCK *spinlock) -{ - spinlock_unlock_internal(spinlock); -} -#endif - -#ifdef SPINLOCK_IMPL_WITH_MUTEX -bool spinlock_trylock_cancelable(SPINLOCK *spinlock) -{ - return netdata_mutex_trylock(&spinlock->inner) == 0; -} -#else -bool 
spinlock_trylock_cancelable(SPINLOCK *spinlock) -{ - return spinlock_trylock_internal(spinlock); -} -#endif - -// ---------------------------------------------------------------------------- -// rw_spinlock implementation - -void rw_spinlock_init(RW_SPINLOCK *rw_spinlock) { - rw_spinlock->readers = 0; - rw_spinlock->writers_waiting = 0; - spinlock_init(&rw_spinlock->spinlock); -} - -void rw_spinlock_read_lock(RW_SPINLOCK *rw_spinlock) { - while(1) { - spinlock_lock(&rw_spinlock->spinlock); - if (!rw_spinlock->writers_waiting) { - __atomic_add_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); - spinlock_unlock(&rw_spinlock->spinlock); - break; - } - - spinlock_unlock(&rw_spinlock->spinlock); - yield_the_processor(); // let the writer run - } - - nd_thread_rwspinlock_read_locked(); -} - -void rw_spinlock_read_unlock(RW_SPINLOCK *rw_spinlock) { -#ifndef NETDATA_INTERNAL_CHECKS - __atomic_sub_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); -#else - int32_t x = __atomic_sub_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); - if(x < 0) - fatal("RW_SPINLOCK: readers is negative %d", x); -#endif - - nd_thread_rwspinlock_read_unlocked(); -} - -void rw_spinlock_write_lock(RW_SPINLOCK *rw_spinlock) { - size_t spins = 0; - for(size_t i = 1; true ;i++) { - spinlock_lock(&rw_spinlock->spinlock); - - if(__atomic_load_n(&rw_spinlock->readers, __ATOMIC_RELAXED) == 0) { - if(spins != 0) - rw_spinlock->writers_waiting--; - break; - } - - if(spins == 0) - rw_spinlock->writers_waiting++; - - // Busy wait until all readers have released their locks. 
- spinlock_unlock(&rw_spinlock->spinlock); - if(i == 8 * 2) { - i = 0; - tinysleep(); - } - spins++; - } - - (void)spins; - - nd_thread_rwspinlock_write_locked(); -} - -void rw_spinlock_write_unlock(RW_SPINLOCK *rw_spinlock) { - spinlock_unlock(&rw_spinlock->spinlock); - nd_thread_rwspinlock_write_unlocked(); -} - -bool rw_spinlock_tryread_lock(RW_SPINLOCK *rw_spinlock) { - if(spinlock_trylock(&rw_spinlock->spinlock)) { - __atomic_add_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); - spinlock_unlock(&rw_spinlock->spinlock); - nd_thread_rwspinlock_read_locked(); - return true; - } - - return false; -} - -bool rw_spinlock_trywrite_lock(RW_SPINLOCK *rw_spinlock) { - if(spinlock_trylock(&rw_spinlock->spinlock)) { - if (__atomic_load_n(&rw_spinlock->readers, __ATOMIC_RELAXED) == 0) { - // No readers, we've successfully acquired the write lock - nd_thread_rwspinlock_write_locked(); - return true; - } - else { - // There are readers, unlock the spinlock and return false - spinlock_unlock(&rw_spinlock->spinlock); - } - } - - return false; -} - - #ifdef NETDATA_TRACE_RWLOCKS // ---------------------------------------------------------------------------- diff --git a/src/libnetdata/locks/locks.h b/src/libnetdata/locks/locks.h index e5f3a7e2db2fee..9b6f2654ac4cd3 100644 --- a/src/libnetdata/locks/locks.h +++ b/src/libnetdata/locks/locks.h @@ -13,54 +13,6 @@ typedef pthread_mutex_t netdata_mutex_t; #define NETDATA_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER -#ifdef SPINLOCK_IMPL_WITH_MUTEX - typedef struct netdata_spinlock - { - netdata_mutex_t inner; - } SPINLOCK; -#else - typedef struct netdata_spinlock - { - bool locked; - #ifdef NETDATA_INTERNAL_CHECKS - pid_t locker_pid; - size_t spins; - #endif - } SPINLOCK; -#endif - -#ifdef SPINLOCK_IMPL_WITH_MUTEX -#define NETDATA_SPINLOCK_INITIALIZER { .inner = PTHREAD_MUTEX_INITIALIZER } -#else -#define NETDATA_SPINLOCK_INITIALIZER { .locked = false } -#endif - -void spinlock_init(SPINLOCK *spinlock); -void spinlock_lock(SPINLOCK 
*spinlock); -void spinlock_unlock(SPINLOCK *spinlock); -bool spinlock_trylock(SPINLOCK *spinlock); - -void spinlock_lock_cancelable(SPINLOCK *spinlock); -void spinlock_unlock_cancelable(SPINLOCK *spinlock); -bool spinlock_trylock_cancelable(SPINLOCK *spinlock); - -typedef struct netdata_rw_spinlock { - int32_t readers; - int32_t writers_waiting; - SPINLOCK spinlock; -} RW_SPINLOCK; - -#define NETDATA_RW_SPINLOCK_INITIALIZER \ - { .readers = 0, .spinlock = NETDATA_SPINLOCK_INITIALIZER } - -void rw_spinlock_init(RW_SPINLOCK *rw_spinlock); -void rw_spinlock_read_lock(RW_SPINLOCK *rw_spinlock); -void rw_spinlock_read_unlock(RW_SPINLOCK *rw_spinlock); -void rw_spinlock_write_lock(RW_SPINLOCK *rw_spinlock); -void rw_spinlock_write_unlock(RW_SPINLOCK *rw_spinlock); -bool rw_spinlock_tryread_lock(RW_SPINLOCK *rw_spinlock); -bool rw_spinlock_trywrite_lock(RW_SPINLOCK *rw_spinlock); - #ifdef NETDATA_TRACE_RWLOCKS typedef enum { diff --git a/src/libnetdata/locks/rw-spinlock.c b/src/libnetdata/locks/rw-spinlock.c new file mode 100644 index 00000000000000..d5b87ed143aa78 --- /dev/null +++ b/src/libnetdata/locks/rw-spinlock.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "libnetdata/libnetdata.h" + +// ---------------------------------------------------------------------------- +// rw_spinlock implementation + +void rw_spinlock_init_with_trace(RW_SPINLOCK *rw_spinlock, const char *func) { + rw_spinlock->readers = 0; + rw_spinlock->writers_waiting = 0; + spinlock_init_with_trace(&rw_spinlock->spinlock, func); +} + +void rw_spinlock_read_lock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func) { + size_t spins = 0; + while(1) { + spinlock_lock_with_trace(&rw_spinlock->spinlock, func); + if (!rw_spinlock->writers_waiting) { + __atomic_add_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); + spinlock_unlock_with_trace(&rw_spinlock->spinlock, func); + break; + } + + spinlock_unlock_with_trace(&rw_spinlock->spinlock, func); + yield_the_processor(); // 
let the writer run + spins++; + } + + worker_spinlock_contention(func, spins); + nd_thread_rwspinlock_read_locked(); +} + +void rw_spinlock_read_unlock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func __maybe_unused) { +#ifndef NETDATA_INTERNAL_CHECKS + __atomic_sub_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); +#else + int32_t x = __atomic_sub_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); + if(x < 0) + fatal("RW_SPINLOCK: readers is negative %d", x); +#endif + + nd_thread_rwspinlock_read_unlocked(); +} + +void rw_spinlock_write_lock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func) { + size_t spins = 0; + for(size_t i = 1; true ;i++) { + spinlock_lock_with_trace(&rw_spinlock->spinlock, func); + + if(__atomic_load_n(&rw_spinlock->readers, __ATOMIC_RELAXED) == 0) { + if(spins != 0) + rw_spinlock->writers_waiting--; + break; + } + + if(spins == 0) + rw_spinlock->writers_waiting++; + + // Busy wait until all readers have released their locks. + spinlock_unlock_with_trace(&rw_spinlock->spinlock, func); + if(i == 8 * 2) { + i = 0; + tinysleep(); + } + spins++; + } + + worker_spinlock_contention(func, spins); + nd_thread_rwspinlock_write_locked(); +} + +void rw_spinlock_write_unlock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func) { + spinlock_unlock_with_trace(&rw_spinlock->spinlock, func); + nd_thread_rwspinlock_write_unlocked(); +} + +bool rw_spinlock_tryread_lock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func) { + if(spinlock_trylock_with_trace(&rw_spinlock->spinlock, func)) { + __atomic_add_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); + spinlock_unlock_with_trace(&rw_spinlock->spinlock, func); + nd_thread_rwspinlock_read_locked(); + return true; + } + + return false; +} + +bool rw_spinlock_trywrite_lock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func) { + if(spinlock_trylock_with_trace(&rw_spinlock->spinlock, func)) { + if (__atomic_load_n(&rw_spinlock->readers, __ATOMIC_RELAXED) == 0) { + // No readers, we've 
successfully acquired the write lock + nd_thread_rwspinlock_write_locked(); + return true; + } + else { + // There are readers, unlock the spinlock and return false + spinlock_unlock_with_trace(&rw_spinlock->spinlock, func); + } + } + + return false; +} + diff --git a/src/libnetdata/locks/rw-spinlock.h b/src/libnetdata/locks/rw-spinlock.h new file mode 100644 index 00000000000000..e641eab7962691 --- /dev/null +++ b/src/libnetdata/locks/rw-spinlock.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_RW_SPINLOCK_H +#define NETDATA_RW_SPINLOCK_H + +#include "libnetdata/common.h" +#include "spinlock.h" + +typedef struct netdata_rw_spinlock { + int32_t readers; + int32_t writers_waiting; + SPINLOCK spinlock; +} RW_SPINLOCK; + +#define RW_SPINLOCK_INITIALIZER { .readers = 0, .spinlock = SPINLOCK_INITIALIZER} + +void rw_spinlock_init_with_trace(RW_SPINLOCK *rw_spinlock, const char *func); +void rw_spinlock_read_lock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func); +void rw_spinlock_read_unlock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func); +void rw_spinlock_write_lock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func); +void rw_spinlock_write_unlock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func); +bool rw_spinlock_tryread_lock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func); +bool rw_spinlock_trywrite_lock_with_trace(RW_SPINLOCK *rw_spinlock, const char *func); + + +#define rw_spinlock_init(rw_spinlock) rw_spinlock_init_with_trace(rw_spinlock, __FUNCTION__) +#define rw_spinlock_read_lock(rw_spinlock) rw_spinlock_read_lock_with_trace(rw_spinlock, __FUNCTION__) +#define rw_spinlock_read_unlock(rw_spinlock) rw_spinlock_read_unlock_with_trace(rw_spinlock, __FUNCTION__) +#define rw_spinlock_write_lock(rw_spinlock) rw_spinlock_write_lock_with_trace(rw_spinlock, __FUNCTION__) +#define rw_spinlock_write_unlock(rw_spinlock) rw_spinlock_write_unlock_with_trace(rw_spinlock, __FUNCTION__) +#define 
rw_spinlock_tryread_lock(rw_spinlock) rw_spinlock_tryread_lock_with_trace(rw_spinlock, __FUNCTION__) +#define rw_spinlock_trywrite_lock(rw_spinlock) rw_spinlock_trywrite_lock_with_trace(rw_spinlock, __FUNCTION__) + +#endif //NETDATA_RW_SPINLOCK_H diff --git a/src/libnetdata/locks/spinlock.c b/src/libnetdata/locks/spinlock.c new file mode 100644 index 00000000000000..d1df5c6194114c --- /dev/null +++ b/src/libnetdata/locks/spinlock.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "libnetdata/libnetdata.h" + +// ---------------------------------------------------------------------------- +// spinlock implementation +// https://www.youtube.com/watch?v=rmGJc9PXpuE&t=41s + +#ifndef SPINLOCK_IMPL_WITH_MUTEX + +void spinlock_init_with_trace(SPINLOCK *spinlock, const char *func __maybe_unused) { + memset(spinlock, 0, sizeof(SPINLOCK)); +} + +void spinlock_lock_with_trace(SPINLOCK *spinlock, const char *func) { + size_t spins = 0; + + for(int i = 1; + __atomic_load_n(&spinlock->locked, __ATOMIC_RELAXED) || + __atomic_test_and_set(&spinlock->locked, __ATOMIC_ACQUIRE) + ; i++ + ) { + + spins++; + if(unlikely(i % 8 == 0)) { + i = 0; + tinysleep(); + } + } + + // we have the lock + +#ifdef NETDATA_INTERNAL_CHECKS + spinlock->spins += spins; + spinlock->locker_pid = gettid_cached(); +#endif + + nd_thread_spinlock_locked(); + worker_spinlock_contention(func, spins); +} + +void spinlock_unlock_with_trace(SPINLOCK *spinlock, const char *func __maybe_unused) { +#ifdef NETDATA_INTERNAL_CHECKS + spinlock->locker_pid = 0; +#endif + + __atomic_clear(&spinlock->locked, __ATOMIC_RELEASE); + + nd_thread_spinlock_unlocked(); +} + +bool spinlock_trylock_with_trace(SPINLOCK *spinlock, const char *func __maybe_unused) { + if(!__atomic_load_n(&spinlock->locked, __ATOMIC_RELAXED) && + !__atomic_test_and_set(&spinlock->locked, __ATOMIC_ACQUIRE)) { + // we got the lock + nd_thread_spinlock_locked(); + return true; + } + + return false; +} + +#endif // 
SPINLOCK_IMPL_WITH_MUTEX diff --git a/src/libnetdata/locks/spinlock.h b/src/libnetdata/locks/spinlock.h new file mode 100644 index 00000000000000..f541e264631a94 --- /dev/null +++ b/src/libnetdata/locks/spinlock.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_SPINLOCK_H +#define NETDATA_SPINLOCK_H + +#include "libnetdata/common.h" + +#ifdef SPINLOCK_IMPL_WITH_MUTEX +typedef struct netdata_spinlock +{ + netdata_mutex_t inner; +} SPINLOCK; +#else +typedef struct netdata_spinlock +{ + bool locked; +#ifdef NETDATA_INTERNAL_CHECKS + pid_t locker_pid; + size_t spins; +#endif +} SPINLOCK; +#endif + +#ifdef SPINLOCK_IMPL_WITH_MUTEX +#define SPINLOCK_INITIALIZER { .inner = PTHREAD_MUTEX_INITIALIZER } + +#define spinlock_lock(spinlock) netdata_mutex_lock(&((spinlock)->inner)) +#define spinlock_unlock(spinlock) netdata_mutex_unlock(&((spinlock)->inner)) +#define spinlock_trylock(spinlock) (netdata_mutex_trylock(&((spinlock)->inner)) == 0) +#define spinlock_init(spinlock) netdata_mutex_init(&((spinlock)->inner) +#else +#define SPINLOCK_INITIALIZER { .locked = false } + +void spinlock_init_with_trace(SPINLOCK *spinlock, const char *func); +#define spinlock_init(spinlock) spinlock_init_with_trace(spinlock, __FUNCTION__) + +void spinlock_lock_with_trace(SPINLOCK *spinlock, const char *func); +#define spinlock_lock(spinlock) spinlock_lock_with_trace(spinlock, __FUNCTION__) + +void spinlock_unlock_with_trace(SPINLOCK *spinlock, const char *func __maybe_unused); +#define spinlock_unlock(spinlock) spinlock_unlock_with_trace(spinlock, __FUNCTION__) + +bool spinlock_trylock_with_trace(SPINLOCK *spinlock, const char *func __maybe_unused); +#define spinlock_trylock(spinlock) spinlock_trylock_with_trace(spinlock, __FUNCTION__) + +#endif + +#endif //NETDATA_SPINLOCK_H diff --git a/src/libnetdata/log/nd_log-internals.c b/src/libnetdata/log/nd_log-internals.c index 97f521fad999b8..c57c7f72c3345b 100644 --- a/src/libnetdata/log/nd_log-internals.c +++ 
b/src/libnetdata/log/nd_log-internals.c @@ -301,16 +301,16 @@ struct nd_log nd_log = { }, #endif .std_output = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .initialized = false, }, .std_error = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .initialized = false, }, .sources = { [NDLS_UNSET] = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .method = NDLM_DISABLED, .format = NDLF_JOURNAL, .filename = NULL, @@ -320,7 +320,7 @@ struct nd_log nd_log = { .limits = ND_LOG_LIMITS_UNLIMITED, }, [NDLS_ACCESS] = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .method = NDLM_DEFAULT, .format = NDLF_LOGFMT, .filename = LOG_DIR "/access.log", @@ -330,7 +330,7 @@ struct nd_log nd_log = { .limits = ND_LOG_LIMITS_UNLIMITED, }, [NDLS_ACLK] = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .method = NDLM_FILE, .format = NDLF_LOGFMT, .filename = LOG_DIR "/aclk.log", @@ -340,7 +340,7 @@ struct nd_log nd_log = { .limits = ND_LOG_LIMITS_UNLIMITED, }, [NDLS_COLLECTORS] = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .method = NDLM_DEFAULT, .format = NDLF_LOGFMT, .filename = LOG_DIR "/collector.log", @@ -350,7 +350,7 @@ struct nd_log nd_log = { .limits = ND_LOG_LIMITS_DEFAULT, }, [NDLS_DEBUG] = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .method = NDLM_DISABLED, .format = NDLF_LOGFMT, .filename = LOG_DIR "/debug.log", @@ -360,7 +360,7 @@ struct nd_log nd_log = { .limits = ND_LOG_LIMITS_UNLIMITED, }, [NDLS_DAEMON] = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .method = NDLM_DEFAULT, .filename = LOG_DIR "/daemon.log", .format = NDLF_LOGFMT, @@ -370,7 +370,7 @@ struct nd_log nd_log = { .limits = ND_LOG_LIMITS_DEFAULT, }, [NDLS_HEALTH] = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = 
SPINLOCK_INITIALIZER, .method = NDLM_DEFAULT, .format = NDLF_LOGFMT, .filename = LOG_DIR "/health.log", diff --git a/src/libnetdata/log/nd_log-to-windows-events.c b/src/libnetdata/log/nd_log-to-windows-events.c index f32289daa57f54..673691d9ea3d29 100644 --- a/src/libnetdata/log/nd_log-to-windows-events.c +++ b/src/libnetdata/log/nd_log-to-windows-events.c @@ -439,7 +439,7 @@ static bool nd_logger_windows(struct nd_log_source *source, struct log_field *fi CLEAN_BUFFER *tmp = NULL; - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; spinlock_lock(&spinlock); wevt_generate_all_fields_unsafe(fields, fields_max, &tmp); diff --git a/src/libnetdata/os/random.c b/src/libnetdata/os/random.c index 125e1cdb57895b..18cfbd2d4ecb24 100644 --- a/src/libnetdata/os/random.c +++ b/src/libnetdata/os/random.c @@ -3,7 +3,7 @@ #include "libnetdata/libnetdata.h" #if !defined(HAVE_ARC4RANDOM_BUF) && !defined(HAVE_RAND_S) -static SPINLOCK random_lock = NETDATA_SPINLOCK_INITIALIZER; +static SPINLOCK random_lock = SPINLOCK_INITIALIZER; static __attribute__((constructor)) void random_seed() { // Use current time and process ID to create a high-entropy seed struct timeval tv; diff --git a/src/libnetdata/os/system-maps/cache-host-users-and-groups.c b/src/libnetdata/os/system-maps/cache-host-users-and-groups.c index 53825fd35b75fc..8122835ab0bdd7 100644 --- a/src/libnetdata/os/system-maps/cache-host-users-and-groups.c +++ b/src/libnetdata/os/system-maps/cache-host-users-and-groups.c @@ -53,7 +53,7 @@ static size_t read_passwd_or_group(const char *filename, struct timespec *last_m void update_cached_host_users(void) { if(!netdata_configured_host_prefix || !*netdata_configured_host_prefix) return; - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; if(!spinlock_trylock(&spinlock)) return; char filename[FILENAME_MAX]; @@ -78,7 +78,7 @@ void update_cached_host_users(void) { void 
update_cached_host_groups(void) { if(!netdata_configured_host_prefix || !*netdata_configured_host_prefix) return; - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; if(!spinlock_trylock(&spinlock)) return; char filename[FILENAME_MAX]; diff --git a/src/libnetdata/os/system-maps/cached-gid-groupname.c b/src/libnetdata/os/system-maps/cached-gid-groupname.c index 3fabe94a2e6b0e..070a9337623d69 100644 --- a/src/libnetdata/os/system-maps/cached-gid-groupname.c +++ b/src/libnetdata/os/system-maps/cached-gid-groupname.c @@ -19,7 +19,7 @@ static struct { SIMPLE_HASHTABLE_GROUPNAMES_CACHE ht; } group_cache = { .initialized = false, - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .ht = { 0 }, }; diff --git a/src/libnetdata/os/system-maps/cached-sid-username.c b/src/libnetdata/os/system-maps/cached-sid-username.c index a0f90c546f7b8c..f3011ea7af59da 100644 --- a/src/libnetdata/os/system-maps/cached-sid-username.c +++ b/src/libnetdata/os/system-maps/cached-sid-username.c @@ -35,7 +35,7 @@ static struct { SPINLOCK spinlock; struct simple_hashtable_SID hashtable; } sid_globals = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .hashtable = { 0 }, }; diff --git a/src/libnetdata/os/system-maps/cached-uid-username.c b/src/libnetdata/os/system-maps/cached-uid-username.c index 35d93f2f063a20..e2702a6ea2c98f 100644 --- a/src/libnetdata/os/system-maps/cached-uid-username.c +++ b/src/libnetdata/os/system-maps/cached-uid-username.c @@ -19,7 +19,7 @@ static struct { SIMPLE_HASHTABLE_USERNAMES_CACHE ht; } user_cache = { .initialized = false, - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .ht = { 0 }, }; diff --git a/src/libnetdata/os/windows-perflib/perflib-names.c b/src/libnetdata/os/windows-perflib/perflib-names.c index 18ff2af655b85e..70251d06421684 100644 --- a/src/libnetdata/os/windows-perflib/perflib-names.c +++ 
b/src/libnetdata/os/windows-perflib/perflib-names.c @@ -33,7 +33,7 @@ static struct { struct simple_hashtable_PERFLIB hashtable; FILETIME lastWriteTime; } names_globals = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .size = 0, .array = NULL, }; diff --git a/src/libnetdata/query_progress/progress.c b/src/libnetdata/query_progress/progress.c index 157f20f112a49d..aed0fa74cb92a8 100644 --- a/src/libnetdata/query_progress/progress.c +++ b/src/libnetdata/query_progress/progress.c @@ -72,7 +72,7 @@ static struct progress { } progress = { .initialized = false, - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, }; SIMPLE_HASHTABLE_HASH query_hash(nd_uuid_t *transaction) { diff --git a/src/libnetdata/socket/nd-poll.c b/src/libnetdata/socket/nd-poll.c index c731ff209a1ce8..af1f7764297b86 100644 --- a/src/libnetdata/socket/nd-poll.c +++ b/src/libnetdata/socket/nd-poll.c @@ -90,16 +90,18 @@ int nd_poll_wait(nd_poll_t *ndpl, int timeout_ms, nd_poll_result_t *result) { int n = epoll_wait(ndpl->epoll_fd, &ndpl->ev[0], _countof(ndpl->ev), timeout_ms); if(unlikely(n <= 0)) { - if (n < 0) { - result->events = ND_POLL_OTHER_ERROR; - result->data = NULL; - return -1; - } - else { + if (n == 0) { result->events = ND_POLL_TIMEOUT; result->data = NULL; return 0; } + + if(errno == EINTR || errno == EAGAIN) + continue; + + result->events = ND_POLL_OTHER_ERROR; + result->data = NULL; + return -1; } ndpl->used = n; diff --git a/src/libnetdata/socket/nd-sock.h b/src/libnetdata/socket/nd-sock.h index 03c2e38e1083dc..995f9e00c1792d 100644 --- a/src/libnetdata/socket/nd-sock.h +++ b/src/libnetdata/socket/nd-sock.h @@ -83,7 +83,6 @@ static inline ssize_t nd_sock_write(ND_SOCK *s, const void *buf, size_t num, siz ssize_t rc; do { - errno_clear(); if (nd_sock_is_ssl(s)) rc = netdata_ssl_write(&s->ssl, buf, num); else diff --git a/src/libnetdata/socket/security.c b/src/libnetdata/socket/security.c index 56e5d7c7bb0277..f20eed8a788c17 
100644 --- a/src/libnetdata/socket/security.c +++ b/src/libnetdata/socket/security.c @@ -606,7 +606,7 @@ static SSL_CTX * netdata_ssl_create_server_ctx(unsigned long mode) { * NETDATA_SSL_CONTEXT_EXPORTING - Starts the OpenTSDB context */ void netdata_ssl_initialize_ctx(int selector) { - static SPINLOCK sp = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK sp = SPINLOCK_INITIALIZER; spinlock_lock(&sp); switch (selector) { diff --git a/src/libnetdata/spawn_server/spawn-tester.c b/src/libnetdata/spawn_server/spawn-tester.c index fbd9431ac72fd2..5ebbd4bfec9d78 100644 --- a/src/libnetdata/spawn_server/spawn-tester.c +++ b/src/libnetdata/spawn_server/spawn-tester.c @@ -108,7 +108,7 @@ void test_int_fds_plugin_kill_to_stop(SPAWN_SERVER *server, const char *argv0) { } fprintf(stderr, "\n"); - int code = spawn_server_exec_kill(server, si); + int code = spawn_server_exec_kill(server, si, 0); nd_log(NDLS_COLLECTORS, NDLP_ERR, "child exited with code %d", @@ -162,7 +162,7 @@ void test_popen_plugin_kill_to_stop(const char *argv0) { } fprintf(stderr, "\n"); - int code = spawn_popen_kill(pi); + int code = spawn_popen_kill(pi, 0); nd_log(NDLS_COLLECTORS, NDLP_ERR, "child exited with code %d", diff --git a/src/libnetdata/spawn_server/spawn_popen.c b/src/libnetdata/spawn_server/spawn_popen.c index b8ea0afe6ea952..f584dfb8c579fa 100644 --- a/src/libnetdata/spawn_server/spawn_popen.c +++ b/src/libnetdata/spawn_server/spawn_popen.c @@ -9,7 +9,7 @@ struct popen_instance { }; SPAWN_SERVER *netdata_main_spawn_server = NULL; -static SPINLOCK netdata_main_spawn_server_spinlock = NETDATA_SPINLOCK_INITIALIZER; +static SPINLOCK netdata_main_spawn_server_spinlock = SPINLOCK_INITIALIZER; bool netdata_main_spawn_server_init(const char *name, int argc, const char **argv) { if(netdata_main_spawn_server == NULL) { @@ -180,11 +180,11 @@ int spawn_popen_wait(POPEN_INSTANCE *pi) { return spawn_popen_status_rc(status); } -int spawn_popen_kill(POPEN_INSTANCE *pi) { +int spawn_popen_kill(POPEN_INSTANCE 
*pi, int timeout_ms) { if(!pi) return -1; spawn_popen_close_files(pi); - int status = spawn_server_exec_kill(netdata_main_spawn_server, pi->si); + int status = spawn_server_exec_kill(netdata_main_spawn_server, pi->si, timeout_ms); freez(pi); return spawn_popen_status_rc(status); } diff --git a/src/libnetdata/spawn_server/spawn_popen.h b/src/libnetdata/spawn_server/spawn_popen.h index 5c00f32ff18bf1..6177622df0186f 100644 --- a/src/libnetdata/spawn_server/spawn_popen.h +++ b/src/libnetdata/spawn_server/spawn_popen.h @@ -15,7 +15,7 @@ POPEN_INSTANCE *spawn_popen_run(const char *cmd); POPEN_INSTANCE *spawn_popen_run_argv(const char **argv); POPEN_INSTANCE *spawn_popen_run_variadic(const char *cmd, ...); int spawn_popen_wait(POPEN_INSTANCE *pi); -int spawn_popen_kill(POPEN_INSTANCE *pi); +int spawn_popen_kill(POPEN_INSTANCE *pi, int timeout_ms); pid_t spawn_popen_pid(POPEN_INSTANCE *pi); int spawn_popen_read_fd(POPEN_INSTANCE *pi); diff --git a/src/libnetdata/spawn_server/spawn_server.h b/src/libnetdata/spawn_server/spawn_server.h index e68a53ab43a9d5..82f936d276cb6a 100644 --- a/src/libnetdata/spawn_server/spawn_server.h +++ b/src/libnetdata/spawn_server/spawn_server.h @@ -42,7 +42,7 @@ void spawn_server_destroy(SPAWN_SERVER *server); pid_t spawn_server_pid(SPAWN_SERVER *server); SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd, int custom_fd, const char **argv, const void *data, size_t data_size, SPAWN_INSTANCE_TYPE type); -int spawn_server_exec_kill(SPAWN_SERVER *server, SPAWN_INSTANCE *si); +int spawn_server_exec_kill(SPAWN_SERVER *server, SPAWN_INSTANCE *si, int timeout_ms); int spawn_server_exec_wait(SPAWN_SERVER *server, SPAWN_INSTANCE *si); int spawn_server_instance_read_fd(SPAWN_INSTANCE *si); diff --git a/src/libnetdata/spawn_server/spawn_server_libuv.c b/src/libnetdata/spawn_server/spawn_server_libuv.c index e01c5407eaf703..49eba349dad9df 100644 --- a/src/libnetdata/spawn_server/spawn_server_libuv.c +++ 
b/src/libnetdata/spawn_server/spawn_server_libuv.c @@ -361,7 +361,7 @@ SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd __maybe_un return item.instance; } -int spawn_server_exec_kill(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE *si) { +int spawn_server_exec_kill(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE *si, int timeout_ms __maybe_unused) { if(!si) return -1; // close all pipe descriptors to force the child to exit diff --git a/src/libnetdata/spawn_server/spawn_server_nofork.c b/src/libnetdata/spawn_server/spawn_server_nofork.c index 9986740debd21c..5c6a7505903c44 100644 --- a/src/libnetdata/spawn_server/spawn_server_nofork.c +++ b/src/libnetdata/spawn_server/spawn_server_nofork.c @@ -1197,9 +1197,20 @@ int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE * return rc; } -int spawn_server_exec_kill(SPAWN_SERVER *server, SPAWN_INSTANCE *instance) { +int spawn_server_exec_kill(SPAWN_SERVER *server, SPAWN_INSTANCE *instance, int timeout_ms) { + if(instance->write_fd != -1) { close(instance->write_fd); instance->write_fd = -1; } + if(instance->read_fd != -1) { close(instance->read_fd); instance->read_fd = -1; } + + if(timeout_ms > 0) { + short revents; + NETDATA_SSL ssl = { 0 }; + wait_on_socket_or_cancel_with_timeout(&ssl, instance->sock, timeout_ms, POLLIN, &revents); + } + // kill the child, if it is still running - if(instance->child_pid) kill(instance->child_pid, SIGTERM); + if(instance->child_pid) + kill(instance->child_pid, SIGTERM); + return spawn_server_exec_wait(server, instance); } diff --git a/src/libnetdata/spawn_server/spawn_server_posix.c b/src/libnetdata/spawn_server/spawn_server_posix.c index f96921bb98b5e4..39160f53540c01 100644 --- a/src/libnetdata/spawn_server/spawn_server_posix.c +++ b/src/libnetdata/spawn_server/spawn_server_posix.c @@ -22,7 +22,7 @@ static struct { SPINLOCK spinlock; SPAWN_INSTANCE *instances; } spawn_globals = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock 
= SPINLOCK_INITIALIZER, .instances = NULL, }; @@ -163,7 +163,7 @@ SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd, int custo // unfortunately, on CYGWIN/MSYS posix_spawn() is not thread safe // so, we run it one by one. - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; spinlock_lock(&spinlock); int fds[3] = { stdin_pipe[PIPE_READ], stdout_pipe[PIPE_WRITE], stderr_fd }; @@ -208,7 +208,7 @@ SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd, int custo return si; } -int spawn_server_exec_kill(SPAWN_SERVER *server, SPAWN_INSTANCE *si) { +int spawn_server_exec_kill(SPAWN_SERVER *server, SPAWN_INSTANCE *si, int timeout_ms __maybe_unused) { if (!si) return -1; if (kill(si->child_pid, SIGTERM)) diff --git a/src/libnetdata/spawn_server/spawn_server_windows.c b/src/libnetdata/spawn_server/spawn_server_windows.c index f80925a24b5348..9ce0c6b2b163b1 100644 --- a/src/libnetdata/spawn_server/spawn_server_windows.c +++ b/src/libnetdata/spawn_server/spawn_server_windows.c @@ -144,7 +144,7 @@ int set_fd_blocking(int fd) { //} SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd __maybe_unused, int custom_fd __maybe_unused, const char **argv, const void *data __maybe_unused, size_t data_size __maybe_unused, SPAWN_INSTANCE_TYPE type) { - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; if (type != SPAWN_INSTANCE_TYPE_EXEC) return NULL; @@ -393,22 +393,20 @@ int map_status_code_to_signal(DWORD status_code) { } } -int spawn_server_exec_kill(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE *si) { - if(si->child_pid != -1 && kill(si->child_pid, SIGTERM) != 0) - nd_log(NDLS_COLLECTORS, NDLP_ERR, - "SPAWN PARENT: child of request No %zu, pid %d (winpid %u), failed to be killed", - si->request_id, (int)si->child_pid, si->dwProcessId); - +int spawn_server_exec_kill(SPAWN_SERVER *server __maybe_unused, 
SPAWN_INSTANCE *si, int timeout_ms __maybe_unused) { // this gives some warnings at the spawn-tester, but it is generally better // to have them, to avoid abnormal shutdown of the plugins if(si->read_fd != -1) { close(si->read_fd); si->read_fd = -1; } if(si->write_fd != -1) { close(si->write_fd); si->write_fd = -1; } - if(si->stderr_fd != -1) { - if(!log_forwarder_del_and_close_fd(server->log_forwarder, si->stderr_fd)) - close(si->stderr_fd); - si->stderr_fd = -1; - } + if(timeout_ms > 0) + WaitForSingleObject(si->process_handle, timeout_ms); + + errno_clear(); + if(si->child_pid != -1 && kill(si->child_pid, SIGTERM) != 0) + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: child of request No %zu, pid %d (winpid %u), failed to be killed", + si->request_id, (int)si->child_pid, si->dwProcessId); errno_clear(); if(TerminateProcess(si->process_handle, STATUS_CONTROL_C_EXIT) == 0) @@ -419,12 +417,6 @@ int spawn_server_exec_kill(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE * errno_clear(); TerminateChildProcesses(si); - return spawn_server_exec_wait(server, si); -} - -int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE *si) { - if(si->read_fd != -1) { close(si->read_fd); si->read_fd = -1; } - if(si->write_fd != -1) { close(si->write_fd); si->write_fd = -1; } if(si->stderr_fd != -1) { if(!log_forwarder_del_and_close_fd(server->log_forwarder, si->stderr_fd)) close(si->stderr_fd); @@ -432,6 +424,13 @@ int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE * si->stderr_fd = -1; } + return spawn_server_exec_wait(server, si); +} + +int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE *si) { + if(si->read_fd != -1) { close(si->read_fd); si->read_fd = -1; } + if(si->write_fd != -1) { close(si->write_fd); si->write_fd = -1; } + // wait for the process to end WaitForSingleObject(si->process_handle, INFINITE); @@ -449,6 +448,13 @@ int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, 
SPAWN_INSTANCE * if(err) LocalFree(err); + if(si->stderr_fd != -1) { + if(!log_forwarder_del_and_close_fd(server->log_forwarder, si->stderr_fd)) + close(si->stderr_fd); + + si->stderr_fd = -1; + } + freez(si); return map_status_code_to_signal(exit_code); } diff --git a/src/libnetdata/threads/threads.c b/src/libnetdata/threads/threads.c index f71972eb673ece..9f47343af34c2f 100644 --- a/src/libnetdata/threads/threads.c +++ b/src/libnetdata/threads/threads.c @@ -53,11 +53,11 @@ static struct { pthread_attr_t *attr; } threads_globals = { .exited = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .list = NULL, }, .running = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .list = NULL, }, .attr = NULL, diff --git a/src/libnetdata/worker_utilization/worker_utilization.c b/src/libnetdata/worker_utilization/worker_utilization.c index d30434d66d1910..34e41a64682d2d 100644 --- a/src/libnetdata/worker_utilization/worker_utilization.c +++ b/src/libnetdata/worker_utilization/worker_utilization.c @@ -20,6 +20,15 @@ struct worker_job_type { NETDATA_DOUBLE custom_value; }; +struct worker_spinlock { + const char *function; + size_t locks; + size_t spins; + + size_t statistics_last_locks; + size_t statistics_last_spins; +}; + struct worker { pid_t pid; const char *tag; @@ -40,6 +49,9 @@ struct worker { struct worker_job_type per_job_type[WORKER_UTILIZATION_MAX_JOB_TYPES]; + size_t spinlocks_used; + struct worker_spinlock spinlocks[WORKER_SPINLOCK_CONTENTION_FUNCTIONS]; + struct worker *next; struct worker *prev; }; @@ -56,9 +68,9 @@ static struct workers_globals { Pvoid_t worknames_JudyHS; size_t memory; -} workers_globals = { // workers globals, the base of all worknames +} workers_globals = { // workers globals, the base of all worknames .enabled = false, - .spinlock = NETDATA_SPINLOCK_INITIALIZER, // a lock for the worknames index + .spinlock = SPINLOCK_INITIALIZER, // a lock for the worknames index 
.worknames_JudyHS = NULL, // the worknames index }; @@ -239,30 +251,61 @@ void worker_set_metric(size_t job_id, NETDATA_DOUBLE value) { } } +// -------------------------------------------------------------------------------------------------------------------- + +static inline size_t pointer_hash_function(const char *func) { + uintptr_t addr = (uintptr_t)func; + return (size_t)(((addr >> 4) | (addr >> 16)) + func[0]) % WORKER_SPINLOCK_CONTENTION_FUNCTIONS; +} + +void worker_spinlock_contention(const char *func, size_t spins) { + if(unlikely(!worker)) + return; + + size_t hash = pointer_hash_function(func); + for (size_t i = 0; i < WORKER_SPINLOCK_CONTENTION_FUNCTIONS; i++) { + size_t slot = (hash + i) % WORKER_SPINLOCK_CONTENTION_FUNCTIONS; + if (worker->spinlocks[slot].function == func || worker->spinlocks[slot].function == NULL) { + // Either an empty slot or a matching slot + + worker->spinlocks[slot].function = func; + worker->spinlocks[slot].locks++; + worker->spinlocks[slot].spins += spins; + + return; + } + } + + // Array is full - do nothing +} + // statistics interface void workers_foreach(const char *name, void (*callback)( - void *data - , pid_t pid - , const char *thread_tag - , size_t max_job_id - , size_t utilization_usec - , size_t duration_usec - , size_t jobs_started, size_t is_running - , STRING **job_types_names - , STRING **job_types_units - , WORKER_METRIC_TYPE *job_metric_types - , size_t *job_types_jobs_started - , usec_t *job_types_busy_time - , NETDATA_DOUBLE *job_custom_values - ) - , void *data) { + void *data + , pid_t pid + , const char *thread_tag + , size_t max_job_id + , size_t utilization_usec + , size_t duration_usec + , size_t jobs_started, size_t is_running + , STRING **job_types_names + , STRING **job_types_units + , WORKER_METRIC_TYPE *job_metric_types + , size_t *job_types_jobs_started + , usec_t *job_types_busy_time + , NETDATA_DOUBLE *job_custom_values + , const char *spinlock_functions[] + , size_t *spinlock_locks + , 
size_t *spinlock_spins + ) + , void *data) { if(!workers_globals.enabled) return; spinlock_lock(&workers_globals.spinlock); usec_t busy_time, delta; - size_t i, jobs_started, jobs_running; + size_t jobs_started, jobs_running; size_t workname_size = strlen(name) + 1; struct workers_workname *workname; @@ -291,8 +334,12 @@ void workers_foreach(const char *name, void (*callback)( usec_t per_job_type_busy_time[WORKER_UTILIZATION_MAX_JOB_TYPES]; NETDATA_DOUBLE per_job_custom_values[WORKER_UTILIZATION_MAX_JOB_TYPES]; + const char *spinlock_functions[WORKER_SPINLOCK_CONTENTION_FUNCTIONS]; + size_t spinlock_locks[WORKER_SPINLOCK_CONTENTION_FUNCTIONS]; + size_t spinlock_spins[WORKER_SPINLOCK_CONTENTION_FUNCTIONS]; + size_t max_job_id = p->worker_max_job_id; - for(i = 0; i <= max_job_id ;i++) { + for(size_t i = 0; i <= max_job_id ;i++) { per_job_type_name[i] = p->per_job_type[i].name; per_job_type_units[i] = p->per_job_type[i].units; per_job_metric_type[i] = p->per_job_type[i].type; @@ -375,6 +422,34 @@ void workers_foreach(const char *name, void (*callback)( jobs_running = 1; } + // ------------------------------------------------------------------------------------------------------------ + // spinlock contention + + size_t t = 0; + for(size_t i = 0; i < WORKER_SPINLOCK_CONTENTION_FUNCTIONS ;i++) { + if(!p->spinlocks[i].function) continue; + + spinlock_functions[t] = p->spinlocks[i].function; + + size_t tmp = p->spinlocks[i].locks; + spinlock_locks[t] = tmp - p->spinlocks[i].statistics_last_locks; + p->spinlocks[i].statistics_last_locks = tmp; + + tmp = p->spinlocks[i].spins; + spinlock_spins[t] = tmp - p->spinlocks[i].statistics_last_spins; + p->spinlocks[i].statistics_last_spins = tmp; + + t++; + } + + for(; t < WORKER_SPINLOCK_CONTENTION_FUNCTIONS ;t++) { + spinlock_functions[t] = NULL; + spinlock_locks[t] = 0; + spinlock_spins[t] = 0; + } + + // ------------------------------------------------------------------------------------------------------------ + callback(data 
, p->pid , p->tag @@ -389,6 +464,9 @@ void workers_foreach(const char *name, void (*callback)( , per_job_type_jobs_started , per_job_type_busy_time , per_job_custom_values + , spinlock_functions + , spinlock_locks + , spinlock_spins ); } diff --git a/src/libnetdata/worker_utilization/worker_utilization.h b/src/libnetdata/worker_utilization/worker_utilization.h index 806b7647b9bc82..dd7f237e0cb129 100644 --- a/src/libnetdata/worker_utilization/worker_utilization.h +++ b/src/libnetdata/worker_utilization/worker_utilization.h @@ -6,6 +6,7 @@ // workers interfaces #define WORKER_UTILIZATION_MAX_JOB_TYPES 80 +#define WORKER_SPINLOCK_CONTENTION_FUNCTIONS 200 typedef enum __attribute__((packed)) { WORKER_METRIC_EMPTY = 0, @@ -25,25 +26,29 @@ void worker_unregister(void); void worker_is_idle(void); void worker_is_busy(size_t job_id); void worker_set_metric(size_t job_id, NETDATA_DOUBLE value); +void worker_spinlock_contention(const char *func, size_t spins); // statistics interface void workers_foreach(const char *name, void (*callback)( - void *data - , pid_t pid - , const char *thread_tag - , size_t max_job_id - , size_t utilization_usec - , size_t duration_usec - , size_t jobs_started - , size_t is_running - , STRING **job_types_names - , STRING **job_types_units - , WORKER_METRIC_TYPE *job_metric_types - , size_t *job_types_jobs_started - , usec_t *job_types_busy_time - , NETDATA_DOUBLE *job_custom_values - ) - , void *data); + void *data + , pid_t pid + , const char *thread_tag + , size_t max_job_id + , size_t utilization_usec + , size_t duration_usec + , size_t jobs_started + , size_t is_running + , STRING **job_types_names + , STRING **job_types_units + , WORKER_METRIC_TYPE *job_metric_types + , size_t *job_types_jobs_started + , usec_t *job_types_busy_time + , NETDATA_DOUBLE *job_custom_values + , const char *spinlock_functions[] + , size_t *spinlock_locks + , size_t *spinlock_spins + ) + , void *data); #endif // WORKER_UTILIZATION_H diff --git 
a/src/ml/ad_charts.cc b/src/ml/ad_charts.cc index abb9ea0ed5a65f..13e5b2e659061c 100644 --- a/src/ml/ad_charts.cc +++ b/src/ml/ad_charts.cc @@ -289,7 +289,7 @@ void ml_update_host_and_detection_rate_charts(ml_host_t *host, collected_number rrdset_flag_set(host->type_anomaly_rate_rs, RRDSET_FLAG_ANOMALY_DETECTION); } - spinlock_lock_cancelable(&host->type_anomaly_rate_spinlock); + spinlock_lock(&host->type_anomaly_rate_spinlock); for (auto &entry : host->type_anomaly_rate) { ml_type_anomaly_rate_t &type_anomaly_rate = entry.second; @@ -306,7 +306,7 @@ void ml_update_host_and_detection_rate_charts(ml_host_t *host, collected_number type_anomaly_rate.anomalous_dimensions = 0; type_anomaly_rate.normal_dimensions = 0; } - spinlock_unlock_cancelable(&host->type_anomaly_rate_spinlock); + spinlock_unlock(&host->type_anomaly_rate_spinlock); rrdset_done(host->type_anomaly_rate_rs); } @@ -409,8 +409,8 @@ void ml_update_training_statistics_chart(ml_worker_t *worker, const ml_queue_sta char id_buf[1024]; char name_buf[1024]; - snprintfz(id_buf, 1024, "training_queue_%zu_stats", worker->id); - snprintfz(name_buf, 1024, "training_queue_%zu_stats", worker->id); + snprintfz(id_buf, 1024, "training_queue_%zu_ops", worker->id); + snprintfz(name_buf, 1024, "training_queue_%zu_ops", worker->id); worker->queue_stats_rs = rrdset_create( localhost, @@ -418,9 +418,9 @@ void ml_update_training_statistics_chart(ml_worker_t *worker, const ml_queue_sta id_buf, // id name_buf, // name NETDATA_ML_CHART_FAMILY, // family - "netdata.queue_stats", // ctx - "Training queue stats", // title - "items", // units + "netdata.queue_ops", // ctx + "Training queue operations", // title + "count", // units NETDATA_ML_PLUGIN, // plugin NETDATA_ML_MODULE_TRAINING, // module NETDATA_ML_CHART_PRIO_QUEUE_STATS, // priority @@ -429,20 +429,67 @@ void ml_update_training_statistics_chart(ml_worker_t *worker, const ml_queue_sta ); rrdset_flag_set(worker->queue_stats_rs, RRDSET_FLAG_ANOMALY_DETECTION); - 
worker->queue_stats_queue_size_rd = - rrddim_add(worker->queue_stats_rs, "queue_size", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); - worker->queue_stats_popped_items_rd = - rrddim_add(worker->queue_stats_rs, "popped_items", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + worker->queue_stats_num_create_new_model_requests_rd = + rrddim_add(worker->queue_stats_rs, "pushed create model", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + worker->queue_stats_num_create_new_model_requests_completed_rd = + rrddim_add(worker->queue_stats_rs, "popped create model", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + + worker->queue_stats_num_add_existing_model_requests_rd = + rrddim_add(worker->queue_stats_rs, "pushed add model", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + + worker->queue_stats_num_add_existing_model_requests_completed_rd = + rrddim_add(worker->queue_stats_rs, "popped add models", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); } rrddim_set_by_pointer(worker->queue_stats_rs, - worker->queue_stats_queue_size_rd, stats.queue_size); + worker->queue_stats_num_create_new_model_requests_rd, stats.total_create_new_model_requests_pushed); rrddim_set_by_pointer(worker->queue_stats_rs, - worker->queue_stats_popped_items_rd, stats.num_popped_items); + worker->queue_stats_num_create_new_model_requests_completed_rd, stats.total_create_new_model_requests_popped); + + rrddim_set_by_pointer(worker->queue_stats_rs, + worker->queue_stats_num_add_existing_model_requests_rd, stats.total_add_existing_model_requests_pushed); + rrddim_set_by_pointer(worker->queue_stats_rs, + worker->queue_stats_num_add_existing_model_requests_completed_rd, stats.total_add_existing_model_requests_popped); rrdset_done(worker->queue_stats_rs); } + { + if (!worker->queue_size_rs) { + char id_buf[1024]; + char name_buf[1024]; + + snprintfz(id_buf, 1024, "training_queue_%zu_size", worker->id); + snprintfz(name_buf, 1024, "training_queue_%zu_size", worker->id); + + worker->queue_size_rs = rrdset_create( + localhost, + "netdata", // type + id_buf, 
// id + name_buf, // name + NETDATA_ML_CHART_FAMILY, // family + "netdata.queue_size", // ctx + "Training queue size", // title + "count", // units + NETDATA_ML_PLUGIN, // plugin + NETDATA_ML_MODULE_TRAINING, // module + NETDATA_ML_CHART_PRIO_QUEUE_STATS, // priority + localhost->rrd_update_every, // update_every + RRDSET_TYPE_LINE// chart_type + ); + rrdset_flag_set(worker->queue_size_rs, RRDSET_FLAG_ANOMALY_DETECTION); + + worker->queue_size_rd = + rrddim_add(worker->queue_size_rs, "items", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } + + ml_queue_size_t qs = ml_queue_size(worker->queue); + collected_number cn = qs.add_exisiting_model + qs.create_new_model; + + rrddim_set_by_pointer(worker->queue_size_rs, worker->queue_size_rd, cn); + rrdset_done(worker->queue_size_rs); + } + /* * training stats */ @@ -462,7 +509,7 @@ void ml_update_training_statistics_chart(ml_worker_t *worker, const ml_queue_sta NETDATA_ML_CHART_FAMILY, // family "netdata.training_time_stats", // ctx "Training time stats", // title - "milliseconds", // units + "microseconds", // units NETDATA_ML_PLUGIN, // plugin NETDATA_ML_MODULE_TRAINING, // module NETDATA_ML_CHART_PRIO_TRAINING_TIME_STATS, // priority @@ -472,11 +519,11 @@ void ml_update_training_statistics_chart(ml_worker_t *worker, const ml_queue_sta rrdset_flag_set(worker->training_time_stats_rs, RRDSET_FLAG_ANOMALY_DETECTION); worker->training_time_stats_allotted_rd = - rrddim_add(worker->training_time_stats_rs, "allotted", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); + rrddim_add(worker->training_time_stats_rs, "allotted", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); worker->training_time_stats_consumed_rd = - rrddim_add(worker->training_time_stats_rs, "consumed", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); + rrddim_add(worker->training_time_stats_rs, "consumed", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); worker->training_time_stats_remaining_rd = - rrddim_add(worker->training_time_stats_rs, "remaining", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); + 
rrddim_add(worker->training_time_stats_rs, "remaining", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); } rrddim_set_by_pointer(worker->training_time_stats_rs, @@ -518,15 +565,15 @@ void ml_update_training_statistics_chart(ml_worker_t *worker, const ml_queue_sta rrdset_flag_set(worker->training_results_rs, RRDSET_FLAG_ANOMALY_DETECTION); worker->training_results_ok_rd = - rrddim_add(worker->training_results_rs, "ok", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rrddim_add(worker->training_results_rs, "ok", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); worker->training_results_invalid_query_time_range_rd = - rrddim_add(worker->training_results_rs, "invalid-queries", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rrddim_add(worker->training_results_rs, "invalid-queries", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); worker->training_results_not_enough_collected_values_rd = - rrddim_add(worker->training_results_rs, "not-enough-values", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rrddim_add(worker->training_results_rs, "not-enough-values", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); worker->training_results_null_acquired_dimension_rd = - rrddim_add(worker->training_results_rs, "null-acquired-dimensions", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rrddim_add(worker->training_results_rs, "null-acquired-dimensions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); worker->training_results_chart_under_replication_rd = - rrddim_add(worker->training_results_rs, "chart-under-replication", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rrddim_add(worker->training_results_rs, "chart-under-replication", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); } rrddim_set_by_pointer(worker->training_results_rs, diff --git a/src/ml/ml.cc b/src/ml/ml.cc index 46d046644f8603..d510090aa6ffc6 100644 --- a/src/ml/ml.cc +++ b/src/ml/ml.cc @@ -113,7 +113,7 @@ ml_dimension_calculated_numbers(ml_worker_t *worker, ml_dimension_t *dim, const } storage_engine_query_finalize(&handle); - telemetry_queries_ml_query_completed(/* points_read */ idx); + 
pulse_queries_ml_query_completed(/* points_read */ idx); training_response.total_values = idx; if (training_response.collected_values < min_n) { @@ -443,7 +443,7 @@ static void ml_dimension_serialize_kmeans(const ml_dimension_t *dim, BUFFER *wb) { RRDDIM *rd = dim->rd; - buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); + buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_MINIFY); buffer_json_member_add_string(wb, "version", "1"); buffer_json_member_add_string(wb, "machine-guid", rd->rrdset->rrdhost->machine_guid); buffer_json_member_add_string(wb, "chart", rrdset_id(rd->rrdset)); @@ -550,7 +550,7 @@ ml_dimension_deserialize_kmeans(const char *json_str) ml_dimension_t *Dim = reinterpret_cast(AcqDim.dimension()); if (!Dim) { - telemetry_ml_models_ignored(); + pulse_ml_models_ignored(); return true; } @@ -571,7 +571,9 @@ static void ml_dimension_stream_kmeans(const ml_dimension_t *dim) if (!s) return; - if(!stream_sender_has_capabilities(dim->rd->rrdset->rrdhost, STREAM_CAP_ML_MODELS)) + if(!stream_sender_has_capabilities(dim->rd->rrdset->rrdhost, STREAM_CAP_ML_MODELS) || + !rrdset_check_upstream_exposed(dim->rd->rrdset) || + !rrddim_check_upstream_exposed(dim->rd)) return; CLEAN_BUFFER *payload = buffer_create(0, NULL); @@ -584,7 +586,7 @@ static void ml_dimension_stream_kmeans(const ml_dimension_t *dim) buffer_tostring(payload)); sender_commit_clean_buffer(s, wb, STREAM_TRAFFIC_TYPE_METADATA); - telemetry_ml_models_sent(); + pulse_ml_models_sent(); } static void ml_dimension_update_models(ml_worker_t *worker, ml_dimension_t *dim) @@ -830,7 +832,7 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t continue; if (anomaly_score < (100 * Cfg.dimension_anomaly_score_threshold)) { - telemetry_ml_models_consulted(models_consulted); + pulse_ml_models_consulted(models_consulted); spinlock_unlock(&dim->slock); return false; } @@ -847,7 +849,7 @@ ml_dimension_predict(ml_dimension_t *dim, time_t 
curr_time, calculated_number_t spinlock_unlock(&dim->slock); - telemetry_ml_models_consulted(models_consulted); + pulse_ml_models_consulted(models_consulted); return sum; } @@ -966,7 +968,7 @@ ml_host_detect_once(ml_host_t *host) host->mls.num_anomalous_dimensions += chart_mls.num_anomalous_dimensions; host->mls.num_normal_dimensions += chart_mls.num_normal_dimensions; - if (spinlock_trylock_cancelable(&host->type_anomaly_rate_spinlock)) + if (spinlock_trylock(&host->type_anomaly_rate_spinlock)) { STRING *key = rs->parts.type; auto &um = host->type_anomaly_rate; @@ -982,7 +984,7 @@ ml_host_detect_once(ml_host_t *host) it->second.anomalous_dimensions += chart_mls.num_anomalous_dimensions; it->second.normal_dimensions += chart_mls.num_normal_dimensions; - spinlock_unlock_cancelable(&host->type_anomaly_rate_spinlock); + spinlock_unlock(&host->type_anomaly_rate_spinlock); } } rrdset_foreach_done(rsp); @@ -1053,27 +1055,8 @@ ml_detect_main(void *arg) netdata_mutex_lock(&worker->nd_mutex); ml_queue_stats_t queue_stats = worker->queue_stats; - worker->queue_stats = {}; netdata_mutex_unlock(&worker->nd_mutex); - // calc the avg values - if (queue_stats.num_popped_items) { - queue_stats.queue_size /= queue_stats.num_popped_items; - queue_stats.allotted_ut /= queue_stats.num_popped_items; - queue_stats.consumed_ut /= queue_stats.num_popped_items; - queue_stats.remaining_ut /= queue_stats.num_popped_items; - } else { - queue_stats.queue_size = ml_queue_size(worker->queue); - queue_stats.consumed_ut = 0; - queue_stats.remaining_ut = queue_stats.allotted_ut; - - queue_stats.item_result_ok = 0; - queue_stats.item_result_invalid_query_time_range = 0; - queue_stats.item_result_not_enough_collected_values = 0; - queue_stats.item_result_null_acquired_dimension = 0; - queue_stats.item_result_chart_under_replication = 0; - } - ml_update_training_statistics_chart(worker, queue_stats); } } @@ -1163,13 +1146,13 @@ static enum ml_worker_result ml_worker_add_existing_model(ml_worker_t 
*worker, m ml_dimension_t *Dim = reinterpret_cast(AcqDim.dimension()); if (!Dim) { - telemetry_ml_models_ignored(); + pulse_ml_models_ignored(); return ML_WORKER_RESULT_OK; } Dim->kmeans = req.inlined_km; ml_dimension_update_models(worker, Dim); - telemetry_ml_models_received(); + pulse_ml_models_received(); return ML_WORKER_RESULT_OK; } @@ -1192,14 +1175,16 @@ void *ml_train_main(void *arg) { while (!Cfg.training_stop) { worker_is_busy(WORKER_TRAIN_QUEUE_POP); + ml_queue_stats_t loop_stats{}; + ml_queue_item_t item = ml_queue_pop(worker->queue); if (item.type == ML_QUEUE_ITEM_STOP_REQUEST) { break; } - size_t queue_size = ml_queue_size(worker->queue) + 1; + ml_queue_size_t queue_size = ml_queue_size(worker->queue); - usec_t allotted_ut = (Cfg.train_every * USEC_PER_SEC) / queue_size; + usec_t allotted_ut = (Cfg.train_every * USEC_PER_SEC) / (queue_size.create_new_model + 1); if (allotted_ut > USEC_PER_SEC) allotted_ut = USEC_PER_SEC; @@ -1230,44 +1215,73 @@ void *ml_train_main(void *arg) { if (Cfg.enable_statistics_charts) { worker_is_busy(WORKER_TRAIN_UPDATE_HOST); - netdata_mutex_lock(&worker->nd_mutex); + ml_queue_stats_t queue_stats = ml_queue_stats(worker->queue); - worker->queue_stats.queue_size += queue_size; - worker->queue_stats.num_popped_items += 1; + loop_stats.total_add_existing_model_requests_pushed = queue_stats.total_add_existing_model_requests_pushed; + loop_stats.total_add_existing_model_requests_popped = queue_stats.total_add_existing_model_requests_popped; + loop_stats.total_create_new_model_requests_pushed = queue_stats.total_create_new_model_requests_pushed; + loop_stats.total_create_new_model_requests_popped = queue_stats.total_create_new_model_requests_popped; - worker->queue_stats.allotted_ut += allotted_ut; - worker->queue_stats.consumed_ut += consumed_ut; - worker->queue_stats.remaining_ut += remaining_ut; + loop_stats.allotted_ut = allotted_ut; + loop_stats.consumed_ut = consumed_ut; + loop_stats.remaining_ut = remaining_ut; switch 
(worker_res) { case ML_WORKER_RESULT_OK: - worker->queue_stats.item_result_ok += 1; + loop_stats.item_result_ok = 1; break; case ML_WORKER_RESULT_INVALID_QUERY_TIME_RANGE: - worker->queue_stats.item_result_invalid_query_time_range += 1; + loop_stats.item_result_invalid_query_time_range = 1; break; case ML_WORKER_RESULT_NOT_ENOUGH_COLLECTED_VALUES: - worker->queue_stats.item_result_not_enough_collected_values += 1; + loop_stats.item_result_not_enough_collected_values = 1; break; case ML_WORKER_RESULT_NULL_ACQUIRED_DIMENSION: - worker->queue_stats.item_result_null_acquired_dimension += 1; + loop_stats.item_result_null_acquired_dimension = 1; break; case ML_WORKER_RESULT_CHART_UNDER_REPLICATION: - worker->queue_stats.item_result_chart_under_replication += 1; + loop_stats.item_result_chart_under_replication = 1; break; } + netdata_mutex_lock(&worker->nd_mutex); + + worker->queue_stats.total_add_existing_model_requests_pushed = loop_stats.total_add_existing_model_requests_pushed; + worker->queue_stats.total_add_existing_model_requests_popped = loop_stats.total_add_existing_model_requests_popped; + + worker->queue_stats.total_create_new_model_requests_pushed = loop_stats.total_create_new_model_requests_pushed; + worker->queue_stats.total_create_new_model_requests_popped = loop_stats.total_create_new_model_requests_popped; + + worker->queue_stats.allotted_ut += loop_stats.allotted_ut; + worker->queue_stats.consumed_ut += loop_stats.consumed_ut; + worker->queue_stats.remaining_ut += loop_stats.remaining_ut; + + worker->queue_stats.item_result_ok += loop_stats.item_result_ok; + worker->queue_stats.item_result_invalid_query_time_range += loop_stats.item_result_invalid_query_time_range; + worker->queue_stats.item_result_not_enough_collected_values += loop_stats.item_result_not_enough_collected_values; + worker->queue_stats.item_result_null_acquired_dimension += loop_stats.item_result_null_acquired_dimension; + worker->queue_stats.item_result_chart_under_replication += 
loop_stats.item_result_chart_under_replication; + netdata_mutex_unlock(&worker->nd_mutex); } + bool should_sleep = true; + if (worker->pending_model_info.size() >= Cfg.flush_models_batch_size) { worker_is_busy(WORKER_TRAIN_FLUSH_MODELS); netdata_mutex_lock(&db_mutex); ml_flush_pending_models(worker); netdata_mutex_unlock(&db_mutex); - continue; + should_sleep = false; } + if (item.type == ML_QUEUE_ITEM_TYPE_ADD_EXISTING_MODEL) { + should_sleep = false; + } + + if (!should_sleep) + continue; + worker_is_idle(); std::this_thread::sleep_for(std::chrono::microseconds{remaining_ut}); } diff --git a/src/ml/ml_memory.cc b/src/ml/ml_memory.cc index 321f59c69ede51..df087f52c8e5e1 100644 --- a/src/ml/ml_memory.cc +++ b/src/ml/ml_memory.cc @@ -1,7 +1,7 @@ #include #include -#include "daemon/telemetry/telemetry-ml.h" +#include "daemon/pulse/pulse-ml.h" void *operator new(size_t size) { @@ -9,7 +9,7 @@ void *operator new(size_t size) if (!ptr) throw std::bad_alloc(); - telemetry_ml_memory_allocated(size); + pulse_ml_memory_allocated(size); return ptr; } @@ -19,14 +19,14 @@ void *operator new[](size_t size) if (!ptr) throw std::bad_alloc(); - telemetry_ml_memory_allocated(size); + pulse_ml_memory_allocated(size); return ptr; } void operator delete(void *ptr, size_t size) noexcept { if (ptr) { - telemetry_ml_memory_freed(size); + pulse_ml_memory_freed(size); free(ptr); } } @@ -34,7 +34,7 @@ void operator delete(void *ptr, size_t size) noexcept void operator delete[](void *ptr, size_t size) noexcept { if (ptr) { - telemetry_ml_memory_freed(size); + pulse_ml_memory_freed(size); free(ptr); } } diff --git a/src/ml/ml_queue.cc b/src/ml/ml_queue.cc index 2e22987eed01da..91f8dd5cd5fe4e 100644 --- a/src/ml/ml_queue.cc +++ b/src/ml/ml_queue.cc @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-3.0-or-later +#include "ml/ml_queue.h" #include "ml_private.h" -ml_queue_t * ml_queue_init() +ml_queue_t *ml_queue_init() { ml_queue_t *q = new ml_queue_t(); @@ -22,7 +23,23 @@ void 
ml_queue_destroy(ml_queue_t *q) void ml_queue_push(ml_queue_t *q, const ml_queue_item_t req) { netdata_mutex_lock(&q->mutex); - q->internal.push(req); + + switch (req.type) { + case ML_QUEUE_ITEM_TYPE_CREATE_NEW_MODEL: + q->create_model_queue.push(req.create_new_model); + q->stats.total_create_new_model_requests_pushed += 1; + break; + + case ML_QUEUE_ITEM_TYPE_ADD_EXISTING_MODEL: + q->add_model_queue.push(req.add_existing_model); + q->stats.total_add_existing_model_requests_pushed += 1; + break; + + case ML_QUEUE_ITEM_STOP_REQUEST: + // Stop requests don't need to be queued + break; + } + pthread_cond_signal(&q->cond_var); netdata_mutex_unlock(&q->mutex); } @@ -34,7 +51,7 @@ ml_queue_item_t ml_queue_pop(ml_queue_t *q) ml_queue_item_t req; req.type = ML_QUEUE_ITEM_STOP_REQUEST; - while (q->internal.empty()) { + while (q->create_model_queue.empty() && q->add_model_queue.empty()) { pthread_cond_wait(&q->cond_var, &q->mutex); if (q->exit) { @@ -43,19 +60,33 @@ ml_queue_item_t ml_queue_pop(ml_queue_t *q) } } - req = q->internal.front(); - q->internal.pop(); + // Prioritize adding model requests + if (!q->add_model_queue.empty()) { + req.type = ML_QUEUE_ITEM_TYPE_ADD_EXISTING_MODEL; + req.add_existing_model = q->add_model_queue.front(); + q->add_model_queue.pop(); + q->stats.total_add_existing_model_requests_popped += 1; + } else if (!q->create_model_queue.empty()) { + req.type = ML_QUEUE_ITEM_TYPE_CREATE_NEW_MODEL; + req.create_new_model = q->create_model_queue.front(); + q->create_model_queue.pop(); + q->stats.total_create_new_model_requests_popped += 1; + } netdata_mutex_unlock(&q->mutex); return req; } -size_t ml_queue_size(ml_queue_t *q) +ml_queue_size_t ml_queue_size(ml_queue_t *q) { netdata_mutex_lock(&q->mutex); - size_t size = q->internal.size(); + ml_queue_size_t qs = ml_queue_size_t { + q->create_model_queue.size(), + q->add_model_queue.size(), + }; netdata_mutex_unlock(&q->mutex); - return size; + + return qs; } void ml_queue_signal(ml_queue_t *q) @@ -65,3 
+96,12 @@ void ml_queue_signal(ml_queue_t *q) pthread_cond_signal(&q->cond_var); netdata_mutex_unlock(&q->mutex); } + +ml_queue_stats_t ml_queue_stats(ml_queue_t *q) +{ + netdata_mutex_lock(&q->mutex); + ml_queue_stats_t stats = q->stats; + netdata_mutex_unlock(&q->mutex); + + return stats; +} diff --git a/src/ml/ml_queue.h b/src/ml/ml_queue.h index 218771acbd56f5..a0f9d06b60d01b 100644 --- a/src/ml/ml_queue.h +++ b/src/ml/ml_queue.h @@ -32,16 +32,17 @@ typedef struct ml_queue_item { ml_request_add_existing_model add_existing_model; } ml_queue_item_t; -struct ml_queue_t { - std::queue internal; - netdata_mutex_t mutex; - pthread_cond_t cond_var; - std::atomic exit; -}; +typedef struct { + size_t create_new_model; + size_t add_exisiting_model; +} ml_queue_size_t; typedef struct { - size_t queue_size; - size_t num_popped_items; + size_t total_create_new_model_requests_pushed; + size_t total_create_new_model_requests_popped; + + size_t total_add_existing_model_requests_pushed; + size_t total_add_existing_model_requests_popped; usec_t allotted_ut; usec_t consumed_ut; @@ -54,6 +55,16 @@ typedef struct { size_t item_result_chart_under_replication; } ml_queue_stats_t; +struct ml_queue_t { + std::queue add_model_queue; + std::queue create_model_queue; + ml_queue_stats_t stats; + + netdata_mutex_t mutex; + pthread_cond_t cond_var; + std::atomic exit; +}; + ml_queue_t *ml_queue_init(); void ml_queue_destroy(ml_queue_t *q); @@ -62,7 +73,9 @@ void ml_queue_push(ml_queue_t *q, const ml_queue_item_t req); ml_queue_item_t ml_queue_pop(ml_queue_t *q); -size_t ml_queue_size(ml_queue_t *q); +ml_queue_size_t ml_queue_size(ml_queue_t *q); + +ml_queue_stats_t ml_queue_stats(ml_queue_t *q); void ml_queue_signal(ml_queue_t *q); diff --git a/src/ml/ml_worker.h b/src/ml/ml_worker.h index 2663ffebbe685d..f823e52783a8fd 100644 --- a/src/ml/ml_worker.h +++ b/src/ml/ml_worker.h @@ -25,8 +25,13 @@ typedef struct { std::vector pending_model_info; RRDSET *queue_stats_rs; - RRDDIM 
*queue_stats_queue_size_rd; - RRDDIM *queue_stats_popped_items_rd; + RRDDIM *queue_stats_num_create_new_model_requests_rd; + RRDDIM *queue_stats_num_add_existing_model_requests_rd; + RRDDIM *queue_stats_num_create_new_model_requests_completed_rd; + RRDDIM *queue_stats_num_add_existing_model_requests_completed_rd; + + RRDSET *queue_size_rs; + RRDDIM *queue_size_rd; RRDSET *training_time_stats_rs; RRDDIM *training_time_stats_allotted_rd; diff --git a/src/plugins.d/plugins_d.c b/src/plugins.d/plugins_d.c index 09be1ffc651b7d..1b7e65cbe1d34d 100644 --- a/src/plugins.d/plugins_d.c +++ b/src/plugins.d/plugins_d.c @@ -76,7 +76,7 @@ static void pluginsd_worker_thread_cleanup(void *pptr) { spinlock_unlock(&cd->unsafe.spinlock); if (pi) - spawn_popen_kill(pi); + spawn_popen_kill(pi, 3 * MSEC_PER_SEC); } #define SERIAL_FAILURES_THRESHOLD 10 @@ -88,7 +88,7 @@ static void pluginsd_worker_thread_handle_success(struct plugind *cd) { if (likely(cd->serial_failures <= SERIAL_FAILURES_THRESHOLD)) { netdata_log_info("PLUGINSD: 'host:%s', '%s' (pid %d) does not generate useful output but it reports success (exits with 0). %s.", - rrdhost_hostname(cd->host), cd->fullfilename, cd->unsafe.pid, + rrdhost_hostname(cd->host), string2str(cd->fullfilename), cd->unsafe.pid, plugin_is_enabled(cd) ? "Waiting a bit before starting it again." : "Will not start it again - it is now disabled."); pluginsd_sleep(cd->update_every * 10); @@ -99,7 +99,7 @@ static void pluginsd_worker_thread_handle_success(struct plugind *cd) { netdata_log_error("PLUGINSD: 'host:'%s', '%s' (pid %d) does not generate useful output, " "although it reports success (exits with 0)." "We have tried to collect something %zu times - unsuccessfully. 
Disabling it.", - rrdhost_hostname(cd->host), cd->fullfilename, cd->unsafe.pid, cd->serial_failures); + rrdhost_hostname(cd->host), string2str(cd->fullfilename), cd->unsafe.pid, cd->serial_failures); plugin_set_disabled(cd); return; } @@ -108,21 +108,21 @@ static void pluginsd_worker_thread_handle_success(struct plugind *cd) { static void pluginsd_worker_thread_handle_error(struct plugind *cd, int worker_ret_code) { if (worker_ret_code == -1) { netdata_log_info("PLUGINSD: 'host:%s', '%s' (pid %d) was killed with SIGTERM. Disabling it.", - rrdhost_hostname(cd->host), cd->fullfilename, cd->unsafe.pid); + rrdhost_hostname(cd->host), string2str(cd->fullfilename), cd->unsafe.pid); plugin_set_disabled(cd); return; } if (!cd->successful_collections) { netdata_log_error("PLUGINSD: 'host:%s', '%s' (pid %d) exited with error code %d and haven't collected any data. Disabling it.", - rrdhost_hostname(cd->host), cd->fullfilename, cd->unsafe.pid, worker_ret_code); + rrdhost_hostname(cd->host), string2str(cd->fullfilename), cd->unsafe.pid, worker_ret_code); plugin_set_disabled(cd); return; } if (cd->serial_failures <= SERIAL_FAILURES_THRESHOLD) { netdata_log_error("PLUGINSD: 'host:%s', '%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times). %s", - rrdhost_hostname(cd->host), cd->fullfilename, cd->unsafe.pid, worker_ret_code, cd->successful_collections, + rrdhost_hostname(cd->host), string2str(cd->fullfilename), cd->unsafe.pid, worker_ret_code, cd->successful_collections, plugin_is_enabled(cd) ? "Waiting a bit before starting it again." : "Will not start it again - it is disabled."); pluginsd_sleep(cd->update_every * 10); @@ -132,7 +132,7 @@ static void pluginsd_worker_thread_handle_error(struct plugind *cd, int worker_r if (cd->serial_failures > SERIAL_FAILURES_THRESHOLD) { netdata_log_error("PLUGINSD: 'host:%s', '%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times)." 
"We tried to restart it %zu times, but it failed to generate data. Disabling it.", - rrdhost_hostname(cd->host), cd->fullfilename, cd->unsafe.pid, worker_ret_code, + rrdhost_hostname(cd->host), string2str(cd->fullfilename), cd->unsafe.pid, worker_ret_code, cd->successful_collections, cd->serial_failures); plugin_set_disabled(cd); return; @@ -152,10 +152,10 @@ static void *pluginsd_worker_thread(void *arg) { size_t count = 0; while(service_running(SERVICE_COLLECTORS)) { - cd->unsafe.pi = spawn_popen_run(cd->cmd); + cd->unsafe.pi = spawn_popen_run(string2str(cd->cmd)); if(!cd->unsafe.pi) { netdata_log_error("PLUGINSD: 'host:%s', cannot popen(\"%s\", \"r\").", - rrdhost_hostname(cd->host), cd->cmd); + rrdhost_hostname(cd->host), string2str(cd->cmd)); break; } cd->unsafe.pid = spawn_popen_pid(cd->unsafe.pi); @@ -163,13 +163,13 @@ static void *pluginsd_worker_thread(void *arg) { nd_log(NDLS_DAEMON, NDLP_DEBUG, "PLUGINSD: 'host:%s' connected to '%s' running on pid %d", rrdhost_hostname(cd->host), - cd->fullfilename, cd->unsafe.pid); + string2str(cd->fullfilename), cd->unsafe.pid); - const char *plugin = strrchr(cd->fullfilename, '/'); + const char *plugin = strrchr(string2str(cd->fullfilename), '/'); if(plugin) plugin++; else - plugin = cd->fullfilename; + plugin = string2str(cd->fullfilename); char module[100]; snprintfz(module, sizeof(module), "plugins.d[%s]", plugin); @@ -188,9 +188,9 @@ static void *pluginsd_worker_thread(void *arg) { nd_log(NDLS_COLLECTORS, NDLP_WARNING, "PLUGINSD: 'host:%s', '%s' (pid %d) disconnected after %zu successful data collections.", - rrdhost_hostname(cd->host), cd->fullfilename, cd->unsafe.pid, count); + rrdhost_hostname(cd->host), string2str(cd->fullfilename), cd->unsafe.pid, count); - int worker_ret_code = spawn_popen_kill(cd->unsafe.pi); + int worker_ret_code = spawn_popen_kill(cd->unsafe.pi, 3 * MSEC_PER_SEC); cd->unsafe.pi = NULL; if(likely(worker_ret_code == 0)) @@ -218,7 +218,7 @@ static void pluginsd_main_cleanup(void *pptr) { 
spinlock_lock(&cd->unsafe.spinlock); if (cd->unsafe.enabled && cd->unsafe.running && cd->unsafe.thread != 0) { netdata_log_info("PLUGINSD: 'host:%s', stopping plugin thread: %s", - rrdhost_hostname(cd->host), cd->id); + rrdhost_hostname(cd->host), string2str(cd->id)); nd_thread_signal_cancel(cd->unsafe.thread); } @@ -321,11 +321,11 @@ void *pluginsd_main(void *ptr) { // check if it runs already struct plugind *cd; for (cd = pluginsd_root; cd; cd = cd->next) - if (unlikely(strcmp(cd->filename, file->d_name) == 0)) + if (unlikely(strcmp(string2str(cd->filename), file->d_name) == 0)) break; if (likely(cd && plugin_is_running(cd))) { - netdata_log_debug(D_PLUGINSD, "plugin '%s' is already running", cd->filename); + netdata_log_debug(D_PLUGINSD, "plugin '%s' is already running", string2str(cd->filename)); continue; } @@ -334,22 +334,42 @@ void *pluginsd_main(void *ptr) { if (unlikely(!cd)) { cd = callocz(sizeof(struct plugind), 1); - snprintfz(cd->id, CONFIG_MAX_NAME, "plugin:%s", pluginname); + { + char buf[CONFIG_MAX_NAME]; + snprintfz(buf, sizeof(buf), "plugin:%s", pluginname); + string_freez(cd->id); + cd->id = string_strdupz(buf); + } + + { + char buf[FILENAME_MAX + 1]; + strncpyz(buf, file->d_name, sizeof(buf) - 1); + string_freez(cd->filename); + cd->filename = string_strdupz(buf); - strncpyz(cd->filename, file->d_name, FILENAME_MAX); - snprintfz(cd->fullfilename, FILENAME_MAX, "%s/%s", directory_name, cd->filename); + snprintfz(buf, sizeof(buf), "%s/%s", directory_name, string2str(cd->filename)); + string_freez(cd->fullfilename); + cd->fullfilename = string_strdupz(buf); + } cd->host = localhost; cd->unsafe.enabled = enabled; cd->unsafe.running = false; - cd->update_every = (int)config_get_duration_seconds(cd->id, "update every", localhost->rrd_update_every); + cd->update_every = (int)config_get_duration_seconds(string2str(cd->id), "update every", localhost->rrd_update_every); cd->started_t = now_realtime_sec(); - char *def = ""; - snprintfz( - cd->cmd, 
PLUGINSD_CMD_MAX, "exec %s %d %s", cd->fullfilename, cd->update_every, - config_get(cd->id, "command options", def)); + { + const char *def = ""; + char buf[PLUGINSD_CMD_MAX + 1]; + + snprintfz( + buf, sizeof(buf), "exec %s %d %s", string2str(cd->fullfilename), + cd->update_every, config_get(string2str(cd->id), "command options", def)); + + string_freez(cd->cmd); + cd->cmd = string_strdupz(buf); + } // link it DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(pluginsd_root, cd, prev, next); diff --git a/src/plugins.d/plugins_d.h b/src/plugins.d/plugins_d.h index e400aeee43a1a3..5c8d54e723223e 100644 --- a/src/plugins.d/plugins_d.h +++ b/src/plugins.d/plugins_d.h @@ -14,11 +14,10 @@ struct rrdhost; extern char *plugin_directories[PLUGINSD_MAX_DIRECTORIES]; struct plugind { - char id[CONFIG_MAX_NAME+1]; // config node id - - char filename[FILENAME_MAX+1]; // just the filename - char fullfilename[FILENAME_MAX+1]; // with path - char cmd[PLUGINSD_CMD_MAX+1]; // the command that it executes + STRING *id; // config node id + STRING *filename; // just the filename + STRING *fullfilename; // with path + STRING *cmd; // the command that it executes size_t successful_collections; // the number of times we have seen // values collected from this plugin @@ -26,7 +25,7 @@ struct plugind { size_t serial_failures; // the number of times the plugin started // without collecting values - struct rrdhost *host; // the host the plugin collects data for + struct rrdhost *host; // the host the plugin collects data for int update_every; // the plugin default data collection frequency struct { diff --git a/src/plugins.d/pluginsd_functions.c b/src/plugins.d/pluginsd_functions.c index 26477a7dbd613b..8a3a3fee3b6834 100644 --- a/src/plugins.d/pluginsd_functions.c +++ b/src/plugins.d/pluginsd_functions.c @@ -51,7 +51,7 @@ static void inflight_functions_insert_callback(const DICTIONARY_ITEM *item, void // send the command to the plugin // IMPORTANT: make sure all commands are sent in 1 call, because in 
streaming they may interfere with others - ssize_t ret = send_to_plugin(buffer_tostring(buffer), parser); + ssize_t ret = send_to_plugin(buffer_tostring(buffer), parser, STREAM_TRAFFIC_TYPE_FUNCTIONS); pf->sent_monotonic_ut = now_monotonic_usec(); if(ret < 0) { @@ -155,7 +155,7 @@ static void pluginsd_function_cancel(void *data) { snprintfz(buffer, sizeof(buffer), PLUGINSD_CALL_FUNCTION_CANCEL " %s\n", transaction); // send the command to the plugin - ssize_t ret = send_to_plugin(buffer, t->parser); + ssize_t ret = send_to_plugin(buffer, t->parser, STREAM_TRAFFIC_TYPE_FUNCTIONS); if(ret < 0) sent = true; @@ -183,7 +183,7 @@ static void pluginsd_function_progress_to_plugin(void *data) { snprintfz(buffer, sizeof(buffer), PLUGINSD_CALL_FUNCTION_PROGRESS " %s\n", transaction); // send the command to the plugin - ssize_t ret = send_to_plugin(buffer, t->parser); + ssize_t ret = send_to_plugin(buffer, t->parser, STREAM_TRAFFIC_TYPE_FUNCTIONS); if(ret < 0) sent = true; diff --git a/src/plugins.d/pluginsd_internals.c b/src/plugins.d/pluginsd_internals.c index bc1cccc8ca43d1..8bf4f87a3fde7b 100644 --- a/src/plugins.d/pluginsd_internals.c +++ b/src/plugins.d/pluginsd_internals.c @@ -2,10 +2,13 @@ #include "pluginsd_internals.h" -ssize_t send_to_plugin(const char *txt, PARSER *parser) { +ssize_t send_to_plugin(const char *txt, PARSER *parser, STREAM_TRAFFIC_TYPE type) { if(!txt || !*txt || !parser) return 0; + if(parser->send_to_plugin_cb) + return parser->send_to_plugin_cb(txt, parser->send_to_plugin_data, type); + #ifdef ENABLE_H2O if(parser->h2o_ctx) return h2o_stream_write(parser->h2o_ctx, txt, strlen(txt)); diff --git a/src/plugins.d/pluginsd_internals.h b/src/plugins.d/pluginsd_internals.h index 26c1f18b90a9ff..0ab0ced0cdeb33 100644 --- a/src/plugins.d/pluginsd_internals.h +++ b/src/plugins.d/pluginsd_internals.h @@ -13,7 +13,7 @@ PARSER_RC PLUGINSD_DISABLE_PLUGIN(PARSER *parser, const char *keyword, const char *msg); -ssize_t send_to_plugin(const char *txt, PARSER 
*parser); +ssize_t send_to_plugin(const char *txt, PARSER *parser, STREAM_TRAFFIC_TYPE type); static inline RRDHOST *pluginsd_require_scope_host(PARSER *parser, const char *cmd) { RRDHOST *host = parser->user.host; @@ -37,16 +37,16 @@ static inline RRDSET *pluginsd_get_scope_chart(PARSER *parser) { return parser->user.st; } -static inline void pluginsd_lock_rrdset_data_collection(PARSER *parser) { +static inline void rrdset_data_collection_lock_with_trace(PARSER *parser, const char *func) { if(parser->user.st && !parser->user.v2.locked_data_collection) { - spinlock_lock(&parser->user.st->data_collection_lock); + spinlock_lock_with_trace(&parser->user.st->data_collection_lock, func); parser->user.v2.locked_data_collection = true; } } -static inline bool pluginsd_unlock_rrdset_data_collection(PARSER *parser) { +static inline bool rrdset_data_collection_unlock_with_trace(PARSER *parser, const char *func) { if(parser->user.st && parser->user.v2.locked_data_collection) { - spinlock_unlock(&parser->user.st->data_collection_lock); + spinlock_unlock_with_trace(&parser->user.st->data_collection_lock, func); parser->user.v2.locked_data_collection = false; return true; } @@ -54,8 +54,11 @@ static inline bool pluginsd_unlock_rrdset_data_collection(PARSER *parser) { return false; } -static inline void pluginsd_unlock_previous_scope_chart(PARSER *parser, const char *keyword, bool stale) { - if(unlikely(pluginsd_unlock_rrdset_data_collection(parser))) { +#define rrdset_data_collection_lock(parser) rrdset_data_collection_lock_with_trace(parser, __FUNCTION__) +#define rrdset_data_collection_unlock(parser) rrdset_data_collection_unlock_with_trace(parser, __FUNCTION__) + +static inline void rrdset_previous_scope_chart_unlock(PARSER *parser, const char *keyword, bool stale) { + if(unlikely(rrdset_data_collection_unlock(parser))) { if(stale) netdata_log_error("PLUGINSD: 'host:%s/chart:%s/' stale data collection lock found during %s; it has been unlocked", 
rrdhost_hostname(parser->user.st->rrdhost), @@ -76,7 +79,7 @@ static inline void pluginsd_unlock_previous_scope_chart(PARSER *parser, const ch } static inline void pluginsd_clear_scope_chart(PARSER *parser, const char *keyword) { - pluginsd_unlock_previous_scope_chart(parser, keyword, true); + rrdset_previous_scope_chart_unlock(parser, keyword, true); if(parser->user.cleanup_slots && parser->user.st) rrdset_pluginsd_receive_unslot(parser->user.st); diff --git a/src/plugins.d/pluginsd_parser.c b/src/plugins.d/pluginsd_parser.c index 7cfb483dab8711..f7eff639261426 100644 --- a/src/plugins.d/pluginsd_parser.c +++ b/src/plugins.d/pluginsd_parser.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "pluginsd_internals.h" +#include "streaming/replication.h" static inline PARSER_RC pluginsd_set(char **words, size_t num_words, PARSER *parser) { int idx = 1; @@ -327,7 +328,7 @@ static inline PARSER_RC pluginsd_chart(char **words, size_t num_words, PARSER *p st = rrdset_create( host, type, id, name, family, context, title, units, - (plugin && *plugin) ? plugin : parser->user.cd->filename, + (plugin && *plugin) ? 
plugin : string2str(parser->user.cd->filename), module, priority, update_every, chart_type); @@ -783,7 +784,7 @@ static inline PARSER_RC pluginsd_begin_v2(char **words, size_t num_words, PARSER // ------------------------------------------------------------------------ // prepare our state - pluginsd_lock_rrdset_data_collection(parser); + rrdset_data_collection_lock(parser); parser->user.v2.update_every = update_every; parser->user.v2.end_time = end_time; @@ -1034,7 +1035,7 @@ static inline PARSER_RC pluginsd_end_v2(char **words __maybe_unused, size_t num_ // ------------------------------------------------------------------------ // unblock data collection - pluginsd_unlock_previous_scope_chart(parser, PLUGINSD_KEYWORD_END_V2, false); + rrdset_previous_scope_chart_unlock(parser, PLUGINSD_KEYWORD_END_V2, false); rrdcontext_collected_rrdset(st); store_metric_collection_completed(); @@ -1222,6 +1223,7 @@ inline size_t pluginsd_process(RRDHOST *host, struct plugind *cd, int fd_input, CLEANUP_FUNCTION_REGISTER(pluginsd_process_thread_cleanup) cleanup_parser = parser; buffered_reader_init(&parser->reader); CLEAN_BUFFER *buffer = buffer_create(sizeof(parser->reader.read_buffer) + 2, NULL); + bool send_quit = true; while(likely(service_running(SERVICE_COLLECTORS))) { if(unlikely(!buffered_reader_next_line(&parser->reader, buffer))) { @@ -1231,6 +1233,7 @@ inline size_t pluginsd_process(RRDHOST *host, struct plugind *cd, int fd_input, if(unlikely(ret != BUFFERED_READER_READ_OK)) { nd_log(NDLS_COLLECTORS, NDLP_INFO, "Buffered reader not OK"); + send_quit = false; break; } @@ -1244,6 +1247,9 @@ inline size_t pluginsd_process(RRDHOST *host, struct plugind *cd, int fd_input, buffer->buffer[0] = '\0'; } + if(send_quit) + send_to_plugin(PLUGINSD_CALL_QUIT, parser, STREAM_TRAFFIC_TYPE_METADATA); + cd->unsafe.enabled = parser->user.enabled; count = parser->user.data_collections_count; diff --git a/src/plugins.d/pluginsd_parser.h b/src/plugins.d/pluginsd_parser.h index 
c3542fef9994ae..08804245782944 100644 --- a/src/plugins.d/pluginsd_parser.h +++ b/src/plugins.d/pluginsd_parser.h @@ -94,6 +94,8 @@ typedef struct parser_user_object { } PARSER_USER_OBJECT; typedef void (*parser_deferred_action_t)(struct parser *parser, void *action_data); +struct parser; +typedef ssize_t (*send_to_plugin_callback_t)(const char *txt, void *data, STREAM_TRAFFIC_TYPE type); struct parser { uint8_t version; // Parser version @@ -102,6 +104,8 @@ struct parser { int fd_input; int fd_output; ND_SOCK *sock; + send_to_plugin_callback_t send_to_plugin_cb; + void *send_to_plugin_data; #ifdef ENABLE_H2O void *h2o_ctx; // if set we use h2o_stream functions to send data diff --git a/src/plugins.d/pluginsd_replication.c b/src/plugins.d/pluginsd_replication.c index 2c3ad6a57dca48..aab06576555f0b 100644 --- a/src/plugins.d/pluginsd_replication.c +++ b/src/plugins.d/pluginsd_replication.c @@ -2,6 +2,7 @@ #include "pluginsd_replication.h" #include "streaming/stream-receiver-internals.h" +#include "streaming/replication.h" PARSER_RC pluginsd_replay_begin(char **words, size_t num_words, PARSER *parser) { int idx = 1; diff --git a/src/streaming/protocol/command-nodeid.c b/src/streaming/protocol/command-nodeid.c index c6c7110c69582e..e154813473e943 100644 --- a/src/streaming/protocol/command-nodeid.c +++ b/src/streaming/protocol/command-nodeid.c @@ -34,7 +34,7 @@ void stream_receiver_send_node_and_claim_id_to_child(RRDHOST *host) { PLUGINSD_KEYWORD_NODE_ID " '%s' '%s' '%s'\n", claim_id.str, node_id_str, cloud_config_url_get()); - send_to_plugin(buf, __atomic_load_n(&host->receiver->thread.parser, __ATOMIC_RELAXED)); + send_to_plugin(buf, __atomic_load_n(&host->receiver->thread.parser, __ATOMIC_RELAXED), STREAM_TRAFFIC_TYPE_METADATA); } rrdhost_receiver_unlock(host); } @@ -74,7 +74,7 @@ void stream_sender_get_node_and_claim_id_from_parent(struct sender_state *s) { if(!UUIDiszero(s->host->node_id) && !UUIDeq(s->host->node_id, node_id)) { if(claimed) { - 
nd_log(NDLS_DAEMON, NDLP_ERR, + nd_log(NDLS_DAEMON, NDLP_WARNING, "STREAM %s [send to %s] parent reports different node id '%s', but we are claimed. Ignoring it.", rrdhost_hostname(s->host), s->connected_to, node_id_str ? node_id_str : "(unset)"); diff --git a/src/streaming/replication.c b/src/streaming/replication.c index 588601ceba79d0..f95fcaf28d1811 100644 --- a/src/streaming/replication.c +++ b/src/streaming/replication.c @@ -5,7 +5,6 @@ #include "replication.h" #include "Judy.h" -#define STREAMING_START_MAX_SENDER_BUFFER_PERCENTAGE_ALLOWED 50ULL #define MAX_REPLICATION_MESSAGE_PERCENT_SENDER_BUFFER 25ULL #define MAX_SENDER_BUFFER_PERCENTAGE_ALLOWED 50ULL #define MIN_SENDER_BUFFER_PERCENTAGE_ALLOWED 10ULL @@ -37,7 +36,7 @@ #define REQUESTS_AHEAD_PER_THREAD 1 // 1 = enable synchronous queries static struct replication_query_statistics replication_queries = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .queries_started = 0, .queries_finished = 0, .points_read = 0, @@ -770,7 +769,7 @@ static void replicate_log_request(struct replication_request_details *r, const c internal_error(true, #else nd_log_limit_static_global_var(erl, 1, 0); - nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, + nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE, #endif "REPLAY ERROR: 'host:%s/chart:%s' child sent: " "db from %ld to %ld%s, wall clock time %ld, " @@ -837,7 +836,7 @@ static bool send_replay_chart_cmd(struct replication_request_details *r, const c rrdset_id(st), r->wanted.start_streaming ? 
"true" : "false", (unsigned long long)r->wanted.after, (unsigned long long)r->wanted.before); - ssize_t ret = r->caller.callback(buffer, r->caller.parser); + ssize_t ret = r->caller.callback(buffer, r->caller.parser, STREAM_TRAFFIC_TYPE_REPLICATION); if (ret < 0) { netdata_log_error("REPLAY ERROR: 'host:%s/chart:%s' failed to send replication request to child (error %zd)", rrdhost_hostname(r->host), rrdset_id(r->st), ret); @@ -962,7 +961,7 @@ bool replicate_chart_request(send_command callback, struct parser *parser, RRDHO r.wanted.after = 0; r.wanted.before = 0; r.wanted.start_streaming = true; - return send_replay_chart_cmd(&r, "empty replication request, wanted after computed bigger than wanted before", true); + return send_replay_chart_cmd(&r, "empty replication request, wanted 'after' computed bigger than wanted 'before'", true); } // the child should start streaming immediately if the wanted duration is small, or we reached the last entry of the child @@ -987,7 +986,7 @@ struct replication_request { time_t after; // the start time of the query (maybe zero) key for sorting (JudyL) time_t before; // the end time of the query (maybe zero) - usec_t sender_last_flush_ut; // the timestamp of the sender, at the time we indexed this request + usec_t sender_circular_buffer_since_ut; // the timestamp of the sender, at the time we indexed this request Word_t unique_id; // auto-increment, later requests have bigger bool start_streaming; // true, when the parent wants to send the rest of the data (before is overwritten) and enable normal streaming @@ -1051,7 +1050,7 @@ static struct replication_thread { } replication_globals = { .aral_rse = NULL, - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .unsafe = { .pending = 0, @@ -1434,7 +1433,7 @@ static void replication_request_delete_callback(const DICTIONARY_ITEM *item __ma } static bool sender_is_still_connected_for_this_request(struct replication_request *rq) { - return 
rq->sender_last_flush_ut == stream_sender_get_flush_time(rq->sender); + return rq->sender_circular_buffer_since_ut == stream_circular_buffer_get_since_ut(rq->sender->scb); } static bool replication_execute_request(struct replication_request *rq, bool workers) { @@ -1467,7 +1466,8 @@ static bool replication_execute_request(struct replication_request *rq, bool wor // send the replication data rq->q->rq = rq; replication_response_execute_and_finalize( - rq->q, (size_t)((unsigned long long)rq->sender->host->sender->sbuf.cb->max_size * MAX_REPLICATION_MESSAGE_PERCENT_SENDER_BUFFER / 100ULL), workers); + rq->q, + (size_t)((unsigned long long)stream_circular_buffer_get_max_size(rq->sender->scb) * MAX_REPLICATION_MESSAGE_PERCENT_SENDER_BUFFER / 100ULL), workers); rq->q = NULL; @@ -1496,7 +1496,7 @@ void replication_add_request(struct sender_state *sender, const char *chart_id, .after = after, .before = before, .start_streaming = start_streaming, - .sender_last_flush_ut = stream_sender_get_flush_time(sender), + .sender_circular_buffer_since_ut = stream_circular_buffer_get_since_ut(sender->scb), .indexed_in_judy = false, .not_indexed_buffer_full = false, .not_indexed_preprocessing = false, @@ -1505,10 +1505,6 @@ void replication_add_request(struct sender_state *sender, const char *chart_id, if(!sender->replication.oldest_request_after_t || rq.after < sender->replication.oldest_request_after_t) sender->replication.oldest_request_after_t = rq.after; -// if(start_streaming && rrdpush_sender_get_buffer_used_percent(sender) <= STREAMING_START_MAX_SENDER_BUFFER_PERCENTAGE_ALLOWED) -// replication_execute_request(&rq, false); -// -// else dictionary_set(sender->replication.requests, chart_id, &rq, sizeof(struct replication_request)); } @@ -1535,8 +1531,7 @@ void replication_cleanup_sender(struct sender_state *sender) { } void replication_recalculate_buffer_used_ratio_unsafe(struct sender_state *s) { - size_t available = cbuffer_available_size_unsafe(s->host->sender->sbuf.cb); - 
size_t percentage = (s->sbuf.cb->max_size - available) * 100 / s->sbuf.cb->max_size; + size_t percentage = stream_sender_get_buffer_used_percent(s->scb); if(unlikely(percentage > MAX_SENDER_BUFFER_PERCENTAGE_ALLOWED && !stream_sender_replication_buffer_full_get(s))) { stream_sender_replication_buffer_full_set(s, true); @@ -1568,8 +1563,6 @@ void replication_recalculate_buffer_used_ratio_unsafe(struct sender_state *s) { // replication_set_next_point_in_time(0, 0); replication_recursive_unlock(); } - - stream_sender_set_buffer_used_percent(s, percentage); } // ---------------------------------------------------------------------------- @@ -1775,7 +1768,7 @@ static int replication_pipeline_execute_next(void) { if(rq->found) { internal_fatal(rq->executed, "REPLAY FATAL: query has already been executed!"); - if (rq->sender_last_flush_ut != stream_sender_get_flush_time(rq->sender)) { + if (rq->sender_circular_buffer_since_ut != stream_circular_buffer_get_since_ut(rq->sender->scb)) { // the sender has reconnected since this request was queued, // we can safely throw it away, since the parent will resend it replication_response_cancel_and_finalize(rq->q); @@ -1887,7 +1880,7 @@ void *replication_thread_main(void *ptr) { int nodes = (int)dictionary_entries(rrdhost_root_index); int cpus = (int)get_netdata_cpus(); - int threads = MIN(cpus * 2 / 3, nodes / 5); + int threads = MIN(cpus * 1 / 3, nodes / 10); if (threads < 1) threads = 1; else if (threads > MAX_REPLICATION_THREADS) threads = MAX_REPLICATION_THREADS; diff --git a/src/streaming/replication.h b/src/streaming/replication.h index 592e6c19e65bb5..10d8fba3ac6122 100644 --- a/src/streaming/replication.h +++ b/src/streaming/replication.h @@ -4,6 +4,7 @@ #define REPLICATION_H #include "daemon/common.h" +#include "stream-circular-buffer.h" struct parser; @@ -19,7 +20,7 @@ struct replication_query_statistics replication_get_query_statistics(void); bool replicate_chart_response(RRDHOST *rh, RRDSET *rs, bool start_streaming, 
time_t after, time_t before); -typedef ssize_t (*send_command)(const char *txt, struct parser *parser); +typedef ssize_t (*send_command)(const char *txt, struct parser *parser, STREAM_TRAFFIC_TYPE type); bool replicate_chart_request(send_command callback, struct parser *parser, RRDHOST *rh, RRDSET *rs, diff --git a/src/streaming/rrdhost-status.c b/src/streaming/rrdhost-status.c index c830eb1d3b4abe..22ce3de89c7b22 100644 --- a/src/streaming/rrdhost-status.c +++ b/src/streaming/rrdhost-status.c @@ -205,10 +205,14 @@ void rrdhost_status(RRDHOST *host, time_t now, RRDHOST_STATUS *s) { s->stream.peers = nd_sock_socket_peers(&host->sender->sock); s->stream.ssl = nd_sock_is_ssl(&host->sender->sock); - memcpy(s->stream.sent_bytes_on_this_connection_per_type, - host->sender->thread.bytes_sent_by_type, - MIN(sizeof(s->stream.sent_bytes_on_this_connection_per_type), - sizeof(host->sender->thread.bytes_sent_by_type))); + { + STREAM_CIRCULAR_BUFFER_STATS *stats = stream_circular_buffer_stats_unsafe(host->sender->scb); + + memcpy( + s->stream.sent_bytes_on_this_connection_per_type, + stats->bytes_sent_by_type, + MIN(sizeof(s->stream.sent_bytes_on_this_connection_per_type), sizeof(stats->bytes_sent_by_type))); + } if (rrdhost_flag_check(host, RRDHOST_FLAG_STREAM_SENDER_CONNECTED)) { s->stream.hops = host->sender->hops; diff --git a/src/streaming/stream-circular-buffer.c b/src/streaming/stream-circular-buffer.c new file mode 100644 index 00000000000000..60386755f7acd7 --- /dev/null +++ b/src/streaming/stream-circular-buffer.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "stream.h" +#include "stream-sender-internals.h" + +#define STREAM_CIRCULAR_BUFFER_ADAPT_TO_TIMES_MAX_SIZE 3 + +struct stream_circular_buffer { + struct circular_buffer *cb; + STREAM_CIRCULAR_BUFFER_STATS stats; + usec_t last_reset_ut; + + struct { + // the current max size of the buffer + size_t max_size; + + // the current utilization of the buffer + size_t buffer_ratio; + + // the 
last time we flushed the buffer + // by monitoring this we can know if the system was reconnected + usec_t since_ut; + } atomic; +}; + +static inline void stream_circular_buffer_stats_update_unsafe(STREAM_CIRCULAR_BUFFER *scb) { + scb->stats.bytes_size = scb->cb->size; + scb->stats.bytes_max_size = scb->cb->max_size; + scb->stats.bytes_outstanding = cbuffer_next_unsafe(scb->cb, NULL); + scb->stats.bytes_available = cbuffer_available_size_unsafe(scb->cb); + scb->stats.buffer_ratio = (double)(scb->cb->max_size - scb->stats.bytes_available) * 100.0 / (double)scb->cb->max_size; + + __atomic_store_n(&((scb)->atomic.buffer_ratio), (size_t)round(scb->stats.buffer_ratio), __ATOMIC_RELAXED); +} + +STREAM_CIRCULAR_BUFFER *stream_circular_buffer_create(void) { + STREAM_CIRCULAR_BUFFER *scb = callocz(1, sizeof(*scb)); + scb->cb = cbuffer_new(CBUFFER_INITIAL_SIZE, CBUFFER_INITIAL_MAX_SIZE, &netdata_buffers_statistics.cbuffers_streaming); + stream_circular_buffer_stats_update_unsafe(scb); + return scb; +} + +// returns true if it increased the buffer size +bool stream_circular_buffer_set_max_size_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t uncompressed_msg_size, bool force) { + size_t wanted = uncompressed_msg_size * STREAM_CIRCULAR_BUFFER_ADAPT_TO_TIMES_MAX_SIZE; + if(force || scb->cb->max_size < wanted) { + scb->cb->max_size = wanted; + scb->stats.bytes_max_size = scb->cb->max_size; + __atomic_store_n(&scb->atomic.max_size, scb->cb->max_size, __ATOMIC_RELAXED); + stream_circular_buffer_stats_update_unsafe(scb); + return true; + } + + return false; +} + +void stream_circular_buffer_flush_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t buffer_max_size) { + __atomic_store_n(&((scb)->atomic.since_ut), now_monotonic_usec(), __ATOMIC_RELAXED); + + // flush the output buffer from any data it may have + cbuffer_flush(scb->cb); + memset(&scb->stats, 0, sizeof(scb->stats)); + stream_circular_buffer_set_max_size_unsafe(scb, buffer_max_size, true); + 
stream_circular_buffer_recreate_timed_unsafe(scb, now_monotonic_usec(), true); +} + +inline size_t stream_sender_get_buffer_used_percent(STREAM_CIRCULAR_BUFFER *scb) { + return __atomic_load_n(&((scb)->atomic.buffer_ratio), __ATOMIC_RELAXED); +} + +size_t stream_circular_buffer_get_max_size(STREAM_CIRCULAR_BUFFER *scb) { + return __atomic_load_n(&scb->atomic.max_size, __ATOMIC_RELAXED); +} + +void stream_circular_buffer_recreate_timed_unsafe(STREAM_CIRCULAR_BUFFER *scb, usec_t now_ut, bool force) { + if(!force && (scb->stats.bytes_outstanding || now_ut - scb->last_reset_ut < 300 * USEC_PER_SEC)) + return; + + scb->last_reset_ut = now_ut; + + scb->stats.recreates++; // we increase even if we don't do it, to have sender_start() recreate its buffers + + if(scb->cb && scb->cb->size > CBUFFER_INITIAL_SIZE) { + cbuffer_free(scb->cb); + scb->cb = cbuffer_new(CBUFFER_INITIAL_SIZE, stream_send.buffer_max_size, &netdata_buffers_statistics.cbuffers_streaming); + } +} + +inline usec_t stream_circular_buffer_get_since_ut(STREAM_CIRCULAR_BUFFER *scb) { + return __atomic_load_n(&((scb)->atomic.since_ut), __ATOMIC_RELAXED); +} + +void stream_circular_buffer_destroy(STREAM_CIRCULAR_BUFFER *scb) { + cbuffer_free(scb->cb); + freez(scb); +} + +// adds data to the circular buffer, returns false when it can't (buffer is full) +bool stream_circular_buffer_add_unsafe(STREAM_CIRCULAR_BUFFER *scb, const char *data, size_t bytes_actual, size_t bytes_uncompressed, STREAM_TRAFFIC_TYPE type) { + scb->stats.adds++; + scb->stats.bytes_added += bytes_actual; + scb->stats.bytes_uncompressed += bytes_uncompressed; + scb->stats.bytes_sent_by_type[type] += bytes_actual; + bool rc = cbuffer_add_unsafe(scb->cb, data, bytes_actual) == 0; + if(rc) + stream_circular_buffer_stats_update_unsafe(scb); + return rc; +} + +// return the first available chunk at the beginning of the buffer +size_t stream_circular_buffer_get_unsafe(STREAM_CIRCULAR_BUFFER *scb, char **chunk) { + return cbuffer_next_unsafe(scb->cb, 
chunk); +} + +// removes data from the beginning of the circular buffer +void stream_circular_buffer_del_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t bytes) { + scb->stats.sends++; + scb->stats.bytes_sent += bytes; + cbuffer_remove_unsafe(scb->cb, bytes); + stream_circular_buffer_stats_update_unsafe(scb); +} + +// returns a copy of the current circular buffer statistics +STREAM_CIRCULAR_BUFFER_STATS *stream_circular_buffer_stats_unsafe(STREAM_CIRCULAR_BUFFER *scb) { + return &scb->stats; +} diff --git a/src/streaming/stream-circular-buffer.h b/src/streaming/stream-circular-buffer.h new file mode 100644 index 00000000000000..2b40898cd7a5e4 --- /dev/null +++ b/src/streaming/stream-circular-buffer.h @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_STREAM_CIRCULAR_BUFFER_H +#define NETDATA_STREAM_CIRCULAR_BUFFER_H + +#include "libnetdata/libnetdata.h" +#include "stream-traffic-types.h" + +#define CBUFFER_INITIAL_SIZE (16 * 1024) +#define CBUFFER_INITIAL_MAX_SIZE (10 * 1024 * 1024) +#define THREAD_BUFFER_INITIAL_SIZE (8192) + +typedef struct stream_circular_buffer_stats { + size_t adds; + size_t sends; + size_t recreates; + + size_t bytes_added; + size_t bytes_uncompressed; + size_t bytes_sent; + + uint32_t bytes_size; + uint32_t bytes_max_size; + uint32_t bytes_outstanding; + uint32_t bytes_available; + + double buffer_ratio; + + size_t bytes_sent_by_type[STREAM_TRAFFIC_TYPE_MAX]; +} STREAM_CIRCULAR_BUFFER_STATS; + +struct stream_circular_buffer; +typedef struct stream_circular_buffer STREAM_CIRCULAR_BUFFER; + +// -------------------------------------------------------------------------------------------------------------------- +// management + +STREAM_CIRCULAR_BUFFER *stream_circular_buffer_create(void); +void stream_circular_buffer_destroy(STREAM_CIRCULAR_BUFFER *scb); + +// flushes all data in the buffer +void stream_circular_buffer_flush_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t buffer_max_size); + +// recreates the buffer, but it 
does so every 5 minutes and only if the buffer has no data in it +// it does not alter the since_ut time of the buffer, so this is assumed to be the same session +// use this after deleting data from the buffer, to minimize the memory footprint of the buffer +void stream_circular_buffer_recreate_timed_unsafe(STREAM_CIRCULAR_BUFFER *scb, usec_t now_ut, bool force); + +// returns true if it increased the buffer size +// if it changes the size, it updates the statistics +bool stream_circular_buffer_set_max_size_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t uncompressed_msg_size, bool force); + +// returns a pointer to the current circular buffer statistics +// copy it if you plan to use it without a lock +STREAM_CIRCULAR_BUFFER_STATS *stream_circular_buffer_stats_unsafe(STREAM_CIRCULAR_BUFFER *scb); + +// -------------------------------------------------------------------------------------------------------------------- +// atomic operations - no lock needed + +// returns the max size of the buffer in bytes +size_t stream_circular_buffer_get_max_size(STREAM_CIRCULAR_BUFFER *scb); + +// returns the current buffer used ratio +size_t stream_sender_get_buffer_used_percent(STREAM_CIRCULAR_BUFFER *scb); + +// return the monotonic timestamp of the last time the buffer was created +usec_t stream_circular_buffer_get_since_ut(STREAM_CIRCULAR_BUFFER *scb); + +// -------------------------------------------------------------------------------------------------------------------- +// data operations (add, get, remove data from/to the buffer) + +// adds data to the end of the circular buffer, returns false when it can't (buffer is full) +// it updates the statistics +bool stream_circular_buffer_add_unsafe(STREAM_CIRCULAR_BUFFER *scb, const char *data, size_t bytes_actual, size_t bytes_uncompressed, STREAM_TRAFFIC_TYPE type); + +// returns a pointer to the beginning of the buffer, and its size in bytes +size_t stream_circular_buffer_get_unsafe(STREAM_CIRCULAR_BUFFER *scb, char **chunk); 
+ +// removes data from the beginning of circular buffer +// it updates the statistics +void stream_circular_buffer_del_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t bytes); + +#endif //NETDATA_STREAM_CIRCULAR_BUFFER_H diff --git a/src/streaming/stream-connector.c b/src/streaming/stream-connector.c index d3360f9a52a7e5..11f9872365f087 100644 --- a/src/streaming/stream-connector.c +++ b/src/streaming/stream-connector.c @@ -232,7 +232,7 @@ static int stream_connect_upgrade_prelude(RRDHOST *host __maybe_unused, struct s error_report("Missing \"connection\" header in reply"); goto err_cleanup; } - if (strncmp(hdr, CONN_UPGRADE_VAL, strlen(CONN_UPGRADE_VAL))) { + if (strncmp(hdr, CONN_UPGRADE_VAL, strlen(CONN_UPGRADE_VAL)) != 0) { error_report("Expected \"connection: " CONN_UPGRADE_VAL "\""); goto err_cleanup; } @@ -242,7 +242,7 @@ static int stream_connect_upgrade_prelude(RRDHOST *host __maybe_unused, struct s error_report("Missing \"upgrade\" header in reply"); goto err_cleanup; } - if (strncmp(hdr, NETDATA_STREAM_PROTO_NAME, strlen(NETDATA_STREAM_PROTO_NAME))) { + if (strncmp(hdr, NETDATA_STREAM_PROTO_NAME, strlen(NETDATA_STREAM_PROTO_NAME)) != 0) { error_report("Expected \"upgrade: " NETDATA_STREAM_PROTO_NAME "\""); goto err_cleanup; } @@ -566,7 +566,7 @@ struct connector *stream_connector_get(struct sender_state *s) { if(s->connector.id < 0 || s->connector.id >= MAX_CONNECTORS) { // assign this to the dispatcher with fewer nodes - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; spinlock_lock(&spinlock); int min_slot = 0; size_t min_nodes = __atomic_load_n(&connector_globals.connectors[0].nodes, __ATOMIC_RELAXED); @@ -624,7 +624,6 @@ void stream_connector_add(struct sender_state *s) { stream_sender_unlock(s); nd_sock_close(&s->sock); - s->sbuf.cb->max_size = stream_send.buffer_max_size; s->parent_using_h2o = stream_send.parents.h2o; // do not call this with any locks held @@ -721,7 +720,7 @@ static void 
*stream_connector_thread(void *ptr) { } bool stream_connector_init(struct sender_state *s) { - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; if(!s) return false; spinlock_lock(&spinlock); diff --git a/src/streaming/stream-handshake.c b/src/streaming/stream-handshake.c index 1902603afffb67..8fe8f90b2b149a 100644 --- a/src/streaming/stream-handshake.c +++ b/src/streaming/stream-handshake.c @@ -32,15 +32,15 @@ static struct { {STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT, "DISCONNECTED RECEIVER LEFT"}, {STREAM_HANDSHAKE_DISCONNECT_ORPHAN_HOST, "DISCONNECTED ORPHAN HOST"}, {STREAM_HANDSHAKE_NON_STREAMABLE_HOST, "NON STREAMABLE HOST"}, - {STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_RECEIVER_READ_BUFFER, "DISCONNECTED NOT SUFFICIENT RCV READ BUFFER"}, + {STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_READ_BUFFER, "DISCONNECTED NOT SUFFICIENT READ BUFFER"}, {STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SENDER_COMPRESSION_FAILED, "DISCONNECTED SND COMPRESSION FAILED"}, - {STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SENDER_SEND_BUFFER, "DISCONNECTED NOT SUFFICIENT SEND BUFFER"}, + {STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SEND_BUFFER, "DISCONNECTED NOT SUFFICIENT SEND BUFFER"}, {STREAM_HANDSHAKE_DISCONNECT_SOCKET_EOF, "DISCONNECTED SOCKET EOF"}, {STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED, "DISCONNECTED SOCKET READ FAILED"}, {STREAM_HANDSHAKE_DISCONNECT_SOCKET_TIMEOUT, "DISCONNECTED SOCKET TIMEOUT"}, {STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, "DISCONNECT SOCKET ERROR"}, {STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED, "DISCONNECTED SOCKET WRITE FAILED"}, - {STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_PARENT, "DISCONNECTED SOCKET CLOSED BY PARENT"}, + {STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END, "DISCONNECTED SOCKET CLOSED BY REMOTE END"}, {STREAM_HANDSHAKE_ERROR_HTTP_UPGRADE, "HTTP UPGRADE ERROR"}, {STREAM_HANDSHAKE_NO_HOST_IN_DESTINATION, "NO HOST IN DESTINATION - CONFIG ERROR"}, 
{STREAM_HANDSHAKE_CONNECT_TIMEOUT, "CONNECT TIMEOUT"}, diff --git a/src/streaming/stream-handshake.h b/src/streaming/stream-handshake.h index ca68688b741b33..3b4ddcbbe85c38 100644 --- a/src/streaming/stream-handshake.h +++ b/src/streaming/stream-handshake.h @@ -61,20 +61,23 @@ typedef enum { STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN = -15, STREAM_HANDSHAKE_DISCONNECT_NETDATA_EXIT = -16, STREAM_HANDSHAKE_DISCONNECT_PARSER_EXIT = -17, + STREAM_HANDSHAKE_DISCONNECT_UNKNOWN_SOCKET_READ_ERROR = -18, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED = -19, STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT = -20, STREAM_HANDSHAKE_DISCONNECT_ORPHAN_HOST = -21, STREAM_HANDSHAKE_NON_STREAMABLE_HOST = -22, - STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_RECEIVER_READ_BUFFER = -23, + + STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_READ_BUFFER = -23, + STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SEND_BUFFER = -25, + STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SENDER_COMPRESSION_FAILED = -24, - STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SENDER_SEND_BUFFER = -25, STREAM_HANDSHAKE_DISCONNECT_SOCKET_EOF = -26, STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED = -27, STREAM_HANDSHAKE_DISCONNECT_SOCKET_TIMEOUT = -28, STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR = -29, STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED = -30, - STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_PARENT = -31, + STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END = -31, STREAM_HANDSHAKE_ERROR_HTTP_UPGRADE = -32, STREAM_HANDSHAKE_NO_HOST_IN_DESTINATION = -33, STREAM_HANDSHAKE_CONNECT_TIMEOUT = -34, diff --git a/src/streaming/stream-parents.c b/src/streaming/stream-parents.c index c288431b806869..9ca78fc05d2940 100644 --- a/src/streaming/stream-parents.c +++ b/src/streaming/stream-parents.c @@ -54,7 +54,7 @@ struct blocked_parent { DEFINE_JUDYL_TYPED(BLOCKED_PARENTS, struct blocked_parent *); static BLOCKED_PARENTS_JudyLSet blocked_parents_set = { 0 }; -static RW_SPINLOCK blocked_parents_spinlock = NETDATA_RW_SPINLOCK_INITIALIZER; +static 
RW_SPINLOCK blocked_parents_spinlock = RW_SPINLOCK_INITIALIZER; static void block_parent_for_all_nodes(STREAM_PARENT *d, time_t duration_s) { rw_spinlock_write_lock(&blocked_parents_spinlock); @@ -195,7 +195,7 @@ void rrdhost_stream_parents_to_json(BUFFER *wb, RRDHOST_STATUS *s) { } void rrdhost_stream_parent_ssl_init(struct sender_state *s) { - static SPINLOCK sp = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK sp = SPINLOCK_INITIALIZER; spinlock_lock(&sp); if(netdata_ssl_streaming_sender_ctx || !s->host) { diff --git a/src/streaming/stream-path.c b/src/streaming/stream-path.c index c40a1d427d372b..451a3faf840bb9 100644 --- a/src/streaming/stream-path.c +++ b/src/streaming/stream-path.c @@ -241,7 +241,7 @@ void stream_path_send_to_child(RRDHOST *host) { CLEAN_BUFFER *wb = buffer_create(0, NULL); buffer_sprintf(wb, PLUGINSD_KEYWORD_JSON " " PLUGINSD_KEYWORD_JSON_CMD_STREAM_PATH "\n%s\n" PLUGINSD_KEYWORD_JSON_END "\n", buffer_tostring(payload)); - send_to_plugin(buffer_tostring(wb), __atomic_load_n(&host->receiver->thread.parser, __ATOMIC_RELAXED)); + send_to_plugin(buffer_tostring(wb), __atomic_load_n(&host->receiver->thread.parser, __ATOMIC_RELAXED), STREAM_TRAFFIC_TYPE_METADATA); } rrdhost_receiver_unlock(host); } diff --git a/src/streaming/stream-receiver-connection.c b/src/streaming/stream-receiver-connection.c index 66d6e273ce8560..fc3da73d4f53d5 100644 --- a/src/streaming/stream-receiver-connection.c +++ b/src/streaming/stream-receiver-connection.c @@ -57,6 +57,10 @@ void stream_receiver_free(struct receiver_state *rpt) { freez(rpt->program_name); freez(rpt->program_version); + freez(rpt->thread.compressed.buf); + rpt->thread.compressed.buf = NULL; + rpt->thread.compressed.size = 0; + string_freez(rpt->config.send.api_key); string_freez(rpt->config.send.parents); string_freez(rpt->config.send.charts_matching); @@ -261,6 +265,8 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ return stream_receiver_response_too_busy_now(w); 
struct receiver_state *rpt = callocz(1, sizeof(*rpt)); + rpt->thread.compressed.size = COMPRESSION_MAX_CHUNK; + rpt->thread.compressed.buf = mallocz(rpt->thread.compressed.size); rpt->connected_since_s = now_realtime_sec(); rpt->last_msg_t = now_monotonic_sec(); rpt->hops = 1; @@ -531,7 +537,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_ } if(unlikely(web_client_streaming_rate_t > 0)) { - static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + static SPINLOCK spinlock = SPINLOCK_INITIALIZER; static time_t last_stream_accepted_t = 0; time_t now = now_realtime_sec(); diff --git a/src/streaming/stream-receiver-internals.h b/src/streaming/stream-receiver-internals.h index 45ce322bbd330f..4210c78d18de76 100644 --- a/src/streaming/stream-receiver-internals.h +++ b/src/streaming/stream-receiver-internals.h @@ -47,10 +47,18 @@ struct receiver_state { bool enabled; size_t start; size_t used; - char buf[COMPRESSION_MAX_CHUNK * 2]; + size_t size; + char *buf; struct decompressor_state decompressor; } compressed; + struct { + SPINLOCK spinlock; + struct stream_opcode msg; + uint32_t msg_slot; + STREAM_CIRCULAR_BUFFER *scb; + } send_to_child; + struct pollfd_meta meta; } thread; @@ -80,5 +88,8 @@ void stream_receiver_log_status(struct receiver_state *rpt, const char *msg, con void stream_receiver_free(struct receiver_state *rpt); bool stream_receiver_signal_to_stop_and_wait(RRDHOST *host, STREAM_HANDSHAKE reason); +ssize_t send_to_child(const char *txt, void *data, STREAM_TRAFFIC_TYPE type); +void stream_receiver_send_opcode(struct receiver_state *rpt, struct stream_opcode msg); +void stream_receiver_handle_op(struct stream_thread *sth, struct receiver_state *rpt, struct stream_opcode *msg); #endif //NETDATA_STREAM_RECEIVER_INTERNALS_H diff --git a/src/streaming/stream-receiver.c b/src/streaming/stream-receiver.c index ea9ae6e065ef0b..57b8419668db9e 100644 --- a/src/streaming/stream-receiver.c +++ b/src/streaming/stream-receiver.c @@ 
-5,6 +5,8 @@ #include "stream-receiver-internals.h" #include "web/server/h2o/http_server.h" +static void stream_receiver_remove(struct stream_thread *sth, struct receiver_state *rpt, const char *why); + // When a child disconnects this is the maximum we will wait // before we update the cloud that the child is offline #define MAX_CHILD_DISC_DELAY (30000) @@ -28,59 +30,68 @@ static void streaming_receiver_disconnected(void) { // -------------------------------------------------------------------------------------------------------------------- -static inline ssize_t read_stream(struct receiver_state *r, char* buffer, size_t size) { +static bool stream_receiver_log_capabilities(BUFFER *wb, void *ptr) { + struct receiver_state *rpt = ptr; + if(!rpt) + return false; + + stream_capabilities_to_string(wb, rpt->capabilities); + return true; +} + +static bool stream_receiver_log_transport(BUFFER *wb, void *ptr) { + struct receiver_state *rpt = ptr; + if(!rpt) + return false; + + buffer_strcat(wb, nd_sock_is_ssl(&rpt->sock) ? 
"https" : "http"); + return true; +} + +// -------------------------------------------------------------------------------------------------------------------- + +static inline ssize_t write_stream(struct receiver_state *r, char* buffer, size_t size) { if(unlikely(!size)) { internal_error(true, "%s() asked to read zero bytes", __FUNCTION__); + errno_clear(); return -2; } #ifdef ENABLE_H2O if (is_h2o_rrdpush(r)) { - if(nd_thread_signaled_to_cancel()) + if(nd_thread_signaled_to_cancel()) { + errno_clear(); return -3; + } - return (ssize_t)h2o_stream_read(r->h2o_ctx, buffer, size); + return (ssize_t)h2o_stream_write(r->h2o_ctx, buffer, size); } #endif - ssize_t bytes_read = nd_sock_read(&r->sock, buffer, size, 0); - if(bytes_read <= 0) { - if (bytes_read == 0) - netdata_log_error("STREAM: %s(): EOF while reading data from socket!", __FUNCTION__); - else { - netdata_log_error("STREAM: %s() failed to read from socket!", __FUNCTION__); - bytes_read = -1; - } - } - - return bytes_read; + ssize_t bytes_written = nd_sock_send_nowait(&r->sock, buffer, size); + return bytes_written; } -static inline STREAM_HANDSHAKE read_stream_error_to_reason(ssize_t code) { - if(code > 0) - return 0; - - switch(code) { - case 0: - // EOF - return STREAM_HANDSHAKE_DISCONNECT_SOCKET_EOF; - - case -1: - // failed to read - return STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED; - - case -2: - // asked to read zero bytes - return STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_RECEIVER_READ_BUFFER; +static inline ssize_t read_stream(struct receiver_state *r, char* buffer, size_t size) { + if(unlikely(!size)) { + internal_error(true, "%s() asked to read zero bytes", __FUNCTION__); + errno_clear(); + return -2; + } - case -3: - // the thread is cancelled - return STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN; +#ifdef ENABLE_H2O + if (is_h2o_rrdpush(r)) { + if(nd_thread_signaled_to_cancel()) { + errno_clear(); + return -3; + } - default: - // anything else - return 
STREAM_HANDSHAKE_DISCONNECT_UNKNOWN_SOCKET_READ_ERROR; + return (ssize_t)h2o_stream_read(r->h2o_ctx, buffer, size); } +#endif + + ssize_t bytes_read = nd_sock_revc_nowait(&r->sock, buffer, size); + return bytes_read; } // -------------------------------------------------------------------------------------------------------------------- @@ -200,7 +211,7 @@ static inline ssize_t receiver_read_compressed(struct receiver_state *r) { "%s: read_buffer does not start with zero #2", __FUNCTION__ ); ssize_t bytes_read = read_stream(r, r->thread.compressed.buf + r->thread.compressed.used, - sizeof(r->thread.compressed.buf) - r->thread.compressed.used); + r->thread.compressed.size - r->thread.compressed.used); if(bytes_read > 0) { r->thread.compressed.used += bytes_read; @@ -228,11 +239,73 @@ static inline bool receiver_should_stop(struct receiver_state *rpt) { // -------------------------------------------------------------------------------------------------------------------- +void stream_receiver_handle_op(struct stream_thread *sth, struct receiver_state *rpt, struct stream_opcode *msg) { + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_STR(NDF_NIDL_NODE, rpt->host->hostname), + ND_LOG_FIELD_TXT(NDF_SRC_IP, rpt->client_ip), + ND_LOG_FIELD_TXT(NDF_SRC_PORT, rpt->client_port), + ND_LOG_FIELD_CB(NDF_SRC_TRANSPORT, stream_receiver_log_transport, rpt), + ND_LOG_FIELD_CB(NDF_SRC_CAPABILITIES, stream_receiver_log_capabilities, rpt), + ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &streaming_to_parent_msgid), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + if(msg->opcode & STREAM_OPCODE_RECEIVER_BUFFER_OVERFLOW) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW); + errno_clear(); + spinlock_lock(&rpt->thread.send_to_child.spinlock); + // copy the statistics + STREAM_CIRCULAR_BUFFER_STATS stats = *stream_circular_buffer_stats_unsafe(rpt->thread.send_to_child.scb); + spinlock_unlock(&rpt->thread.send_to_child.spinlock); + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM RECEIVE[%zu] %s [from 
%s]: send buffer is full (buffer size %u, max %u, used %u, available %u). " + "Restarting connection.", + sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, + stats.bytes_size, stats.bytes_max_size, stats.bytes_outstanding, stats.bytes_available); + + stream_receiver_remove(sth, rpt, "receiver send buffer overflow"); + return; + } + + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM RECEIVE[%zu]: invalid msg id %u", sth->id, (unsigned)msg->opcode); +} + +ssize_t send_to_child(const char *txt, void *data, STREAM_TRAFFIC_TYPE type) { + struct receiver_state *rpt = data; + if(!rpt || rpt->thread.meta.type != POLLFD_TYPE_RECEIVER || !rpt->thread.send_to_child.scb) + return 0; + + spinlock_lock(&rpt->thread.send_to_child.spinlock); + STREAM_CIRCULAR_BUFFER *scb = rpt->thread.send_to_child.scb; + STREAM_CIRCULAR_BUFFER_STATS *stats = stream_circular_buffer_stats_unsafe(scb); + bool was_empty = stats->bytes_outstanding == 0; + struct stream_opcode msg = rpt->thread.send_to_child.msg; + msg.opcode = STREAM_OPCODE_NONE; + + size_t size = strlen(txt); + ssize_t rc = (ssize_t)size; + if(!stream_circular_buffer_add_unsafe(scb, txt, size, size, type)) { + msg.opcode = STREAM_OPCODE_RECEIVER_BUFFER_OVERFLOW; + rc = -1; + } + else if(was_empty) + msg.opcode = STREAM_OPCODE_RECEIVER_POLLOUT; + + spinlock_unlock(&rpt->thread.send_to_child.spinlock); + + if(msg.opcode != STREAM_OPCODE_NONE) + stream_receiver_send_opcode(rpt, msg); + + return rc; +} + static void streaming_parser_init(struct receiver_state *rpt) { rpt->thread.cd = (struct plugind){ .update_every = default_rrd_update_every, .unsafe = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .running = true, .enabled = true, }, @@ -240,10 +313,25 @@ static void streaming_parser_init(struct receiver_state *rpt) { }; // put the client IP and port into the buffers used by plugins.d - snprintfz(rpt->thread.cd.id, CONFIG_MAX_NAME, "%s:%s", rpt->client_ip, rpt->client_port); - 
snprintfz(rpt->thread.cd.filename, FILENAME_MAX, "%s:%s", rpt->client_ip, rpt->client_port); - snprintfz(rpt->thread.cd.fullfilename, FILENAME_MAX, "%s:%s", rpt->client_ip, rpt->client_port); - snprintfz(rpt->thread.cd.cmd, PLUGINSD_CMD_MAX, "%s:%s", rpt->client_ip, rpt->client_port); + { + char buf[CONFIG_MAX_NAME]; + snprintfz(buf, sizeof(buf), "%s:%s", rpt->client_ip, rpt->client_port); + string_freez(rpt->thread.cd.id); + rpt->thread.cd.id = string_strdupz(buf); + } + + { + char buf[FILENAME_MAX + 1]; + snprintfz(buf, sizeof(buf), "%s:%s", rpt->client_ip, rpt->client_port); + string_freez(rpt->thread.cd.filename); + rpt->thread.cd.filename = string_strdupz(buf); + + string_freez(rpt->thread.cd.fullfilename); + rpt->thread.cd.fullfilename = string_strdupz(buf); + + string_freez(rpt->thread.cd.cmd); + rpt->thread.cd.cmd = string_strdupz(buf); + } PARSER *parser = NULL; { @@ -257,6 +345,8 @@ static void streaming_parser_init(struct receiver_state *rpt) { }; parser = parser_init(&user, -1, -1, PARSER_INPUT_SPLIT, &rpt->sock); + parser->send_to_plugin_data = rpt; + parser->send_to_plugin_cb = send_to_child; } #ifdef ENABLE_H2O @@ -294,26 +384,6 @@ static void streaming_parser_init(struct receiver_state *rpt) { // -------------------------------------------------------------------------------------------------------------------- -static bool stream_receiver_log_capabilities(BUFFER *wb, void *ptr) { - struct receiver_state *rpt = ptr; - if(!rpt) - return false; - - stream_capabilities_to_string(wb, rpt->capabilities); - return true; -} - -static bool stream_receiver_log_transport(BUFFER *wb, void *ptr) { - struct receiver_state *rpt = ptr; - if(!rpt) - return false; - - buffer_strcat(wb, nd_sock_is_ssl(&rpt->sock) ? 
"https" : "http"); - return true; -} - -// -------------------------------------------------------------------------------------------------------------------- - void stream_receiver_move_queue_to_running_unsafe(struct stream_thread *sth) { internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ ); @@ -337,22 +407,64 @@ void stream_receiver_move_queue_to_running_unsafe(struct stream_thread *sth) { "STREAM RECEIVE[%zu] [%s]: moving host from receiver queue to receiver running...", sth->id, rrdhost_hostname(rpt->host)); - internal_fatal(RECEIVERS_GET(&sth->rcv.receivers, (Word_t)rpt) != NULL, "Receiver to be added is already in the list of receivers"); - RECEIVERS_SET(&sth->rcv.receivers, (Word_t)rpt, rpt); - - streaming_parser_init(rpt); - rpt->host->stream.rcv.status.tid = gettid_cached(); rpt->thread.meta.type = POLLFD_TYPE_RECEIVER; rpt->thread.meta.rpt = rpt; + + spinlock_lock(&rpt->thread.send_to_child.spinlock); + rpt->thread.send_to_child.scb = stream_circular_buffer_create(); + + // this should be big enough to fit all the replies to the replication requests we may receive in a batch + stream_circular_buffer_set_max_size_unsafe(rpt->thread.send_to_child.scb, 100 * 1024 * 1024, true); + rpt->thread.send_to_child.msg.thread_slot = (int32_t)sth->id; + rpt->thread.send_to_child.msg.session = os_random32(); + rpt->thread.send_to_child.msg.meta = &rpt->thread.meta; + spinlock_unlock(&rpt->thread.send_to_child.spinlock); + + internal_fatal(META_GET(&sth->run.meta, (Word_t)&rpt->thread.meta) != NULL, "Receiver to be added is already in the list of receivers"); + META_SET(&sth->run.meta, (Word_t)&rpt->thread.meta, &rpt->thread.meta); + + if(sock_setnonblock(rpt->sock.fd) < 0) + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM RECEIVE '%s' [from [%s]:%s]: cannot set the non-blocking flag from socket %d", + rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port, rpt->sock.fd); + if(!nd_poll_add(sth->run.ndpl, 
rpt->sock.fd, ND_POLL_READ, &rpt->thread.meta)) - internal_fatal(true, "Failed to add receiver socket to nd_poll()"); + nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to add receiver socket to nd_poll()"); + + // keep this last, since it sends commands back to the child + streaming_parser_init(rpt); } } -static void stream_receiver_on_disconnect(struct stream_thread *sth __maybe_unused, struct receiver_state *rpt) { +static void stream_receiver_remove(struct stream_thread *sth, struct receiver_state *rpt, const char *why) { internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ ); - if(!rpt) return; + + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: " + "receiver disconnected: %s" + , sth->id + , rpt->hostname ? rpt->hostname : "-" + , rpt->client_ip ? rpt->client_ip : "-" + , rpt->client_port ? rpt->client_port : "-" + , why ? why : ""); + + internal_fatal(META_GET(&sth->run.meta, (Word_t)&rpt->thread.meta) == NULL, "Receiver to be removed is not found in the list of receivers"); + META_DEL(&sth->run.meta, (Word_t)&rpt->thread.meta); + + if(!nd_poll_del(sth->run.ndpl, rpt->sock.fd)) + nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to delete receiver socket from nd_poll()"); + + rpt->host->stream.rcv.status.tid = 0; + + spinlock_lock(&rpt->thread.send_to_child.spinlock); + rpt->thread.send_to_child.msg.session = 0; + rpt->thread.send_to_child.msg.meta = NULL; + stream_circular_buffer_destroy(rpt->thread.send_to_child.scb); + rpt->thread.send_to_child.scb = NULL; + spinlock_unlock(&rpt->thread.send_to_child.spinlock); + + stream_thread_node_removed(rpt->host); buffer_free(rpt->thread.buffer); rpt->thread.buffer = NULL; @@ -388,162 +500,250 @@ static void stream_receiver_on_disconnect(struct stream_thread *sth __maybe_unus rrdhost_clear_receiver(rpt); rrdhost_set_is_parent_label(); + stream_receiver_free(rpt); + // DO NOT USE rpt after this point } -static void stream_receiver_remove(struct 
stream_thread *sth, struct receiver_state *rpt, const char *why) { - internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ ); - - nd_log(NDLS_DAEMON, NDLP_ERR, - "STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: " - "receiver disconnected: %s" - , sth->id - , rpt->hostname ? rpt->hostname : "-" - , rpt->client_ip ? rpt->client_ip : "-" - , rpt->client_port ? rpt->client_port : "-" - , why ? why : ""); +static ssize_t +stream_receive_and_process(struct stream_thread *sth, struct receiver_state *rpt, PARSER *parser, bool *removed) { + ssize_t rc; + if(rpt->thread.compressed.enabled) { + rc = receiver_read_compressed(rpt); + if(unlikely(rc <= 0)) + return rc; + + while(!nd_thread_signaled_to_cancel() && service_running(SERVICE_STREAMING) && !receiver_should_stop(rpt)) { + worker_is_busy(WORKER_STREAM_JOB_DECOMPRESS); + + // feed the decompressor with the new data we just read + decompressor_status_t feed_rc = receiver_feed_decompressor(rpt); + + if(likely(feed_rc == DECOMPRESS_OK)) { + while (true) { + // feed our uncompressed data buffer with new data + decompressor_status_t decompress_rc = receiver_get_decompressed(rpt); + + if (likely(decompress_rc == DECOMPRESS_OK)) { + // loop through all the complete lines found in the uncompressed buffer + + while (buffered_reader_next_line(&rpt->reader, rpt->thread.buffer)) { + if (unlikely(parser_action(parser, rpt->thread.buffer->buffer))) { + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); + stream_receiver_remove(sth, rpt, "parser action failed"); + *removed = true; + return -1; + } - internal_fatal(RECEIVERS_GET(&sth->rcv.receivers, (Word_t)rpt) == NULL, "Receiver to be removed is not found in the list of receivers"); - RECEIVERS_DEL(&sth->rcv.receivers, (Word_t)rpt); - if(!nd_poll_del(sth->run.ndpl, rpt->sock.fd)) - internal_fatal(true, "Failed to remove receiver socket from nd_poll()"); + rpt->thread.buffer->len = 0; + 
rpt->thread.buffer->buffer[0] = '\0'; + } + } + else if (decompress_rc == DECOMPRESS_NEED_MORE_DATA) + break; + + else { + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); + stream_receiver_remove(sth, rpt, "receiver decompressor failed"); + *removed = true; + return -1; + } + } + } + else if (feed_rc == DECOMPRESS_NEED_MORE_DATA) + break; + else { + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); + stream_receiver_remove(sth, rpt, "receiver compressed data invalid"); + *removed = true; + return -1; + } + } - rpt->host->stream.rcv.status.tid = 0; + if(receiver_should_stop(rpt)) { + receiver_set_exit_reason(rpt, rpt->exit.reason, false); + stream_receiver_remove(sth, rpt, "received stop signal"); + *removed = true; + return -1; + } + } + else { + rc = receiver_read_uncompressed(rpt); + if(rc <= 0) return rc; + + while(buffered_reader_next_line(&rpt->reader, rpt->thread.buffer)) { + if(unlikely(parser_action(parser, rpt->thread.buffer->buffer))) { + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); + stream_receiver_remove(sth, rpt, "parser action failed"); + *removed = true; + return -1; + } - stream_thread_node_removed(rpt->host); + rpt->thread.buffer->len = 0; + rpt->thread.buffer->buffer[0] = '\0'; + } + } - stream_receiver_on_disconnect(sth, rpt); - // DO NOT USE rpt after this point + return rc; } // process poll() events for streaming receivers -void stream_receive_process_poll_events(struct stream_thread *sth, struct receiver_state *rpt, nd_poll_event_t events __maybe_unused, usec_t now_ut) { - PARSER *parser = __atomic_load_n(&rpt->thread.parser, __ATOMIC_RELAXED); - ND_LOG_STACK lgs[] = { - ND_LOG_FIELD_TXT(NDF_SRC_IP, rpt->client_ip), - ND_LOG_FIELD_TXT(NDF_SRC_PORT, rpt->client_port), - ND_LOG_FIELD_TXT(NDF_NIDL_NODE, rpt->hostname), - ND_LOG_FIELD_CB(NDF_SRC_TRANSPORT, stream_receiver_log_transport, rpt), - ND_LOG_FIELD_CB(NDF_SRC_CAPABILITIES, 
stream_receiver_log_capabilities, rpt), - ND_LOG_FIELD_CB(NDF_REQUEST, line_splitter_reconstruct_line, &parser->line), - ND_LOG_FIELD_CB(NDF_NIDL_NODE, parser_reconstruct_node, parser), - ND_LOG_FIELD_CB(NDF_NIDL_INSTANCE, parser_reconstruct_instance, parser), - ND_LOG_FIELD_CB(NDF_NIDL_CONTEXT, parser_reconstruct_context, parser), - ND_LOG_FIELD_END(), - }; - ND_LOG_STACK_PUSH(lgs); +void stream_receive_process_poll_events(struct stream_thread *sth, struct receiver_state *rpt, nd_poll_event_t events, usec_t now_ut) +{ + internal_fatal( + sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__); - if(receiver_should_stop(rpt)) { - receiver_set_exit_reason(rpt, rpt->exit.reason, false); - stream_receiver_remove(sth, rpt, "received stop signal"); - return; - } - - rpt->last_msg_t = (time_t)(now_ut / USEC_PER_SEC); + PARSER *parser = __atomic_load_n(&rpt->thread.parser, __ATOMIC_RELAXED); + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_SRC_IP, rpt->client_ip), + ND_LOG_FIELD_TXT(NDF_SRC_PORT, rpt->client_port), + ND_LOG_FIELD_TXT(NDF_NIDL_NODE, rpt->hostname), + ND_LOG_FIELD_CB(NDF_SRC_TRANSPORT, stream_receiver_log_transport, rpt), + ND_LOG_FIELD_CB(NDF_SRC_CAPABILITIES, stream_receiver_log_capabilities, rpt), + ND_LOG_FIELD_CB(NDF_REQUEST, line_splitter_reconstruct_line, &parser->line), + ND_LOG_FIELD_CB(NDF_NIDL_NODE, parser_reconstruct_node, parser), + ND_LOG_FIELD_CB(NDF_NIDL_INSTANCE, parser_reconstruct_instance, parser), + ND_LOG_FIELD_CB(NDF_NIDL_CONTEXT, parser_reconstruct_context, parser), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); - if(rpt->thread.compressed.enabled) { - worker_is_busy(WORKER_STREAM_JOB_SOCKET_RECEIVE); + if (receiver_should_stop(rpt)) { + receiver_set_exit_reason(rpt, rpt->exit.reason, false); + stream_receiver_remove(sth, rpt, "received stop signal"); + return; + } - ssize_t bytes = receiver_read_compressed(rpt); - if(unlikely(bytes <= 0)) { - if(bytes < 0 && (errno == EWOULDBLOCK 
|| errno == EAGAIN || errno == EINTR)) - return; + if (unlikely(events & (ND_POLL_ERROR | ND_POLL_HUP | ND_POLL_INVALID))) { + // we have errors on this socket - worker_is_busy(WORKER_STREAM_JOB_SOCKET_ERROR); - receiver_set_exit_reason(rpt, read_stream_error_to_reason(bytes), false); - stream_receiver_remove(sth, rpt, "receiver socket read error"); - return; - } + worker_is_busy(WORKER_STREAM_JOB_SOCKET_ERROR); - bool node_removed = false; - while(!node_removed && !nd_thread_signaled_to_cancel() && service_running(SERVICE_STREAMING) && !receiver_should_stop(rpt)) { - worker_is_busy(WORKER_STREAM_JOB_DECOMPRESS); + char *error = "unknown error"; - // feed the decompressor with the new data we just read - decompressor_status_t feed = receiver_feed_decompressor(rpt); + if (events & ND_POLL_ERROR) + error = "socket reports errors"; + else if (events & ND_POLL_HUP) + error = "connection closed by remote end (HUP)"; + else if (events & ND_POLL_INVALID) + error = "connection is invalid"; - if(likely(feed == DECOMPRESS_OK)) { - while (!node_removed) { - // feed our uncompressed data buffer with new data - decompressor_status_t rc = receiver_get_decompressed(rpt); + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SOCKET_ERROR); - if (likely(rc == DECOMPRESS_OK)) { - // loop through all the complete lines found in the uncompressed buffer + nd_log( + NDLS_DAEMON, + NDLP_ERR, + "STREAM RECEIVE[%zu] %s [from %s]: %s - closing connection", + sth->id, + rrdhost_hostname(rpt->host), + rpt->client_ip, + error); - while (buffered_reader_next_line(&rpt->reader, rpt->thread.buffer)) { - if (unlikely(parser_action(parser, rpt->thread.buffer->buffer))) { - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); - stream_receiver_remove(sth, rpt, "parser action failed"); - node_removed = true; - break; - } + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, false); + stream_receiver_remove(sth, rpt, error); + return; + } - 
rpt->thread.buffer->len = 0; - rpt->thread.buffer->buffer[0] = '\0'; - } - } - else if (rc == DECOMPRESS_NEED_MORE_DATA) - break; - - else { - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); - stream_receiver_remove(sth, rpt, "receiver decompressor failed"); - node_removed = true; - break; - } - } + if (events & ND_POLL_WRITE) { + worker_is_busy(WORKER_STREAM_JOB_SOCKET_SEND); + + if (spinlock_trylock(&rpt->thread.send_to_child.spinlock)) { + const char *disconnect_reason = NULL; + STREAM_HANDSHAKE reason; + + char *chunk; + STREAM_CIRCULAR_BUFFER *scb = rpt->thread.send_to_child.scb; + STREAM_CIRCULAR_BUFFER_STATS *stats = stream_circular_buffer_stats_unsafe(scb); + size_t outstanding = stream_circular_buffer_get_unsafe(scb, &chunk); + ssize_t rc = write_stream(rpt, chunk, outstanding); + if (likely(rc > 0)) { + stream_circular_buffer_del_unsafe(scb, rc); + if (!stats->bytes_outstanding) { + if (!nd_poll_upd(sth->run.ndpl, rpt->sock.fd, ND_POLL_READ, &rpt->thread.meta)) + nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM RECEIVE: cannot update nd_poll()"); + + // recreate the circular buffer if we have to + stream_circular_buffer_recreate_timed_unsafe(rpt->thread.send_to_child.scb, now_ut, false); } - else if (feed == DECOMPRESS_NEED_MORE_DATA) - break; + } else if (rc == 0 || errno == ECONNRESET) { + disconnect_reason = "socket reports EOF (closed by child)"; + reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END; + } else if (rc < 0) { + if (errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR) + // will try later + ; else { - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); - stream_receiver_remove(sth, rpt, "receiver compressed data invalid"); - node_removed = true; - break; + disconnect_reason = "socket reports error while writing"; + reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED; } } - - if(!node_removed && receiver_should_stop(rpt)) { - receiver_set_exit_reason(rpt, 
rpt->exit.reason, false); - stream_receiver_remove(sth, rpt, "received stop signal"); + spinlock_unlock(&rpt->thread.send_to_child.spinlock); + + if (disconnect_reason) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR); + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM RECEIVE[%zu] %s [from %s]: %s (%zd, on fd %d) - closing connection - " + "we have sent %zu bytes in %zu operations.", + sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, disconnect_reason, rc, rpt->sock.fd, + stats->bytes_sent, stats->sends); + + receiver_set_exit_reason(rpt, reason, false); + stream_receiver_remove(sth, rpt, disconnect_reason); return; } } - else { - worker_is_busy(WORKER_STREAM_JOB_SOCKET_RECEIVE); + } - ssize_t bytes = receiver_read_uncompressed(rpt); - if(unlikely(bytes <= 0)) { - if(bytes < 0 && (errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR)) - return; + if (!(events & ND_POLL_READ)) + return; - worker_is_busy(WORKER_STREAM_JOB_SOCKET_ERROR); - receiver_set_exit_reason(rpt, read_stream_error_to_reason(bytes), false); - stream_receiver_remove(sth, rpt, "socker read error"); - return; - } + // we can receive data from this socket - while(buffered_reader_next_line(&rpt->reader, rpt->thread.buffer)) { - if(unlikely(parser_action(parser, rpt->thread.buffer->buffer))) { - receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false); - stream_receiver_remove(sth, rpt, "parser action failed"); - break; - } + worker_is_busy(WORKER_STREAM_JOB_SOCKET_RECEIVE); + while(true) { + bool removed = false; + ssize_t rc = stream_receive_and_process(sth, rpt, parser, &removed); + if (likely(rc > 0)) { + rpt->last_msg_t = (time_t)(now_ut / USEC_PER_SEC); + } + else if (rc == 0 || errno == ECONNRESET) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_REMOTE_CLOSED); + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM RECEIVE[%zu] %s [from %s]: socket %d reports EOF (closed by child).", + sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->sock.fd); + 
receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END, false); + stream_receiver_remove(sth, rpt, "socket reports EOF (closed by child)"); + return; + } + else if (rc < 0) { + if(removed) + return; - rpt->thread.buffer->len = 0; - rpt->thread.buffer->buffer[0] = '\0'; + else if ((errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR)) + // will try later + break; + else { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR); + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM RECEIVE[%zu] %s [from %s]: error during receive (%zd, on fd %d) - closing connection.", + sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rc, rpt->sock.fd); + receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED, false); + stream_receiver_remove(sth, rpt, "error during receive"); + return; } } + } } void stream_receiver_cleanup(struct stream_thread *sth) { Word_t idx = 0; - for(struct receiver_state *rpt = RECEIVERS_FIRST(&sth->rcv.receivers, &idx); - rpt; - rpt = RECEIVERS_NEXT(&sth->rcv.receivers, &idx)) + for(struct pollfd_meta *m = META_FIRST(&sth->run.meta, &idx); + m; + m = META_NEXT(&sth->run.meta, &idx)) { + if (m->type != POLLFD_TYPE_RECEIVER) continue; + struct receiver_state *rpt = m->rpt; stream_receiver_remove(sth, rpt, "shutdown"); - - RECEIVERS_FREE(&sth->rcv.receivers, NULL); + } } static void stream_receiver_replication_reset(RRDHOST *host) { diff --git a/src/streaming/stream-sender-api.c b/src/streaming/stream-sender-api.c index 6bcf791340efa0..a19964102c9edf 100644 --- a/src/streaming/stream-sender-api.c +++ b/src/streaming/stream-sender-api.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "stream-sender-internals.h" +#include "replication.h" bool stream_sender_has_capabilities(struct rrdhost *host, STREAM_CAPABILITIES capabilities) { return host && stream_has_capability(host->sender, capabilities); @@ -32,7 +33,7 @@ void stream_sender_structures_init(RRDHOST *host, bool stream, STRING 
*parents, host->sender->connector.id = -1; host->sender->host = host; - host->sender->sbuf.cb = cbuffer_new(CBUFFER_INITIAL_SIZE, CBUFFER_INITIAL_MAX_SIZE, &netdata_buffers_statistics.cbuffers_streaming); + host->sender->scb = stream_circular_buffer_create(); host->sender->capabilities = stream_our_capabilities(host, true); nd_sock_init(&host->sender->sock, netdata_ssl_streaming_sender_ctx, netdata_ssl_validate_certificate_sender); @@ -61,8 +62,8 @@ void stream_sender_structures_free(struct rrdhost *host) { // stop a possibly running thread stream_sender_signal_to_stop_and_wait(host, STREAM_HANDSHAKE_DISCONNECT_HOST_CLEANUP, true); - cbuffer_free(host->sender->sbuf.cb); - + stream_circular_buffer_destroy(host->sender->scb); + host->sender->scb = NULL; stream_compressor_destroy(&host->sender->compressor); replication_cleanup_sender(host->sender); @@ -111,7 +112,7 @@ void stream_sender_signal_to_stop_and_wait(struct rrdhost *host, STREAM_HANDSHAK msg.opcode = STREAM_OPCODE_SENDER_STOP_RECEIVER_LEFT; else msg.opcode = STREAM_OPCODE_SENDER_STOP_HOST_CLEANUP; - stream_sender_send_msg_to_dispatcher(host->sender, msg); + stream_sender_send_opcode(host->sender, msg); while(wait && rrdhost_flag_check(host, RRDHOST_FLAG_STREAM_SENDER_ADDED)) sleep_usec(10 * USEC_PER_MS); diff --git a/src/streaming/stream-sender-commit.c b/src/streaming/stream-sender-commit.c index 24405c6959f5d3..7ff34eb79bee3a 100644 --- a/src/streaming/stream-sender-commit.c +++ b/src/streaming/stream-sender-commit.c @@ -1,8 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "stream-thread.h" - -#define SENDER_BUFFER_ADAPT_TO_TIMES_MAX_SIZE 3 +#include "replication.h" static __thread struct sender_buffer commit___thread = { 0 }; @@ -67,14 +66,12 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff if (unlikely(!src || !src_len)) return; - size_t total_uncompressed_len = src_len; - size_t total_compressed_len = 0; - stream_sender_lock(s); // copy the sequence number 
of sender buffer recreates, while having our lock + STREAM_CIRCULAR_BUFFER_STATS *stats = stream_circular_buffer_stats_unsafe(s->scb); if(commit) - commit->sender_recreates = s->sbuf.recreates; + commit->sender_recreates = stats->recreates; if (!s->thread.msg.session) { // the dispatcher is not there anymore - ignore these data @@ -84,43 +81,14 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff return; } - if (unlikely(s->sbuf.cb->max_size < (src_len + 1) * SENDER_BUFFER_ADAPT_TO_TIMES_MAX_SIZE)) { - // adaptive sizing of the circular buffer is needed to get this. - - nd_log( - NDLS_DAEMON, - NDLP_NOTICE, - "STREAM %s [send to %s]: max buffer size of %zu is too small " - "for a data message of size %zu. Increasing the max buffer size " - "to %d times the max data message size.", - rrdhost_hostname(s->host), - s->connected_to, - s->sbuf.cb->max_size, - buffer_strlen(wb) + 1, - SENDER_BUFFER_ADAPT_TO_TIMES_MAX_SIZE); - - s->sbuf.cb->max_size = (src_len + 1) * SENDER_BUFFER_ADAPT_TO_TIMES_MAX_SIZE; + if (unlikely(stream_circular_buffer_set_max_size_unsafe(s->scb, src_len, false))) { + // adaptive sizing of the circular buffer + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "STREAM SEND %s [to %s]: Increased max buffer size to %u (message size %zu).", + rrdhost_hostname(s->host), s->connected_to, stats->bytes_max_size, buffer_strlen(wb) + 1); } -#ifdef NETDATA_LOG_STREAM_SENDER - if (type == STREAM_TRAFFIC_TYPE_METADATA) { - if (!s->stream_log_fp) { - char filename[FILENAME_MAX + 1]; - snprintfz( - filename, FILENAME_MAX, "/tmp/stream-sender-%s.txt", s->host ? 
rrdhost_hostname(s->host) : "unknown"); - - s->stream_log_fp = fopen(filename, "w"); - } - - fprintf( - s->stream_log_fp, - "\n--- SEND MESSAGE START: %s ----\n" - "%s" - "--- SEND MESSAGE END ----------------------------------------\n", - rrdhost_hostname(s->host), - src); - } -#endif + stream_sender_log_payload(s, wb, type, false); if (s->compressor.initialized) { // compressed traffic @@ -171,15 +139,13 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff size_t decoded_dst_len = stream_decompress_decode_signature((const char *)&signature, sizeof(signature)); if (decoded_dst_len != dst_len) fatal( - "RRDPUSH COMPRESSION: invalid signature, original payload %zu bytes, " + "STREAM COMPRESSION: invalid signature, original payload %zu bytes, " "compressed payload length %zu bytes, but signature says payload is %zu bytes", size_to_compress, dst_len, decoded_dst_len); #endif - total_compressed_len += dst_len + sizeof(signature); - - if (cbuffer_add_unsafe(s->sbuf.cb, (const char *)&signature, sizeof(signature)) || - cbuffer_add_unsafe(s->sbuf.cb, dst, dst_len)) + if (!stream_circular_buffer_add_unsafe(s->scb, (const char *)&signature, sizeof(signature), sizeof(signature), type) || + !stream_circular_buffer_add_unsafe(s->scb, dst, dst_len, size_to_compress, type)) goto overflow_with_lock; src = src + size_to_compress; @@ -189,15 +155,12 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff else { // uncompressed traffic - total_compressed_len = src_len; - - if (cbuffer_add_unsafe(s->sbuf.cb, src, src_len)) + if (!stream_circular_buffer_add_unsafe(s->scb, src, src_len, src_len, type)) goto overflow_with_lock; } - // update s->dispatcher entries - bool enable_sending = s->thread.bytes_outstanding == 0; - stream_sender_thread_data_added_data_unsafe(s, type, total_compressed_len, total_uncompressed_len); + bool enable_sending = stats->bytes_outstanding == 0; + replication_recalculate_buffer_used_ratio_unsafe(s); 
if (enable_sending) msg = s->thread.msg; @@ -206,33 +169,30 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff if (enable_sending) { msg.opcode = STREAM_OPCODE_SENDER_POLLOUT; - stream_sender_send_msg_to_dispatcher(s, msg); + stream_sender_send_opcode(s, msg); } return; overflow_with_lock: { - size_t buffer_size = s->sbuf.cb->size; - size_t buffer_max_size = s->sbuf.cb->max_size; - size_t buffer_available = cbuffer_available_size_unsafe(s->sbuf.cb); msg = s->thread.msg; stream_sender_unlock(s); msg.opcode = STREAM_OPCODE_SENDER_BUFFER_OVERFLOW; - stream_sender_send_msg_to_dispatcher(s, msg); + stream_sender_send_opcode(s, msg); nd_log(NDLS_DAEMON, NDLP_ERR, - "STREAM %s [send to %s]: buffer overflow while adding %zu bytes (buffer size %zu, max size %zu, available %zu). " + "STREAM %s [send to %s]: buffer overflow (buffer size %u, max size %u, used %u, available %u). " "Restarting connection.", rrdhost_hostname(s->host), s->connected_to, - total_compressed_len, buffer_size, buffer_max_size, buffer_available); + stats->bytes_size, stats->bytes_max_size, stats->bytes_outstanding, stats->bytes_available); return; } compression_failed_with_lock: { - stream_compression_deactivate(s); + stream_compression_deactivate(s); msg = s->thread.msg; stream_sender_unlock(s); msg.opcode = STREAM_OPCODE_SENDER_RECONNECT_WITHOUT_COMPRESSION; - stream_sender_send_msg_to_dispatcher(s, msg); + stream_sender_send_opcode(s, msg); nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM %s [send to %s]: COMPRESSION failed (twice). 
Deactivating compression and restarting connection.", rrdhost_hostname(s->host), s->connected_to); diff --git a/src/streaming/stream-sender-execute.c b/src/streaming/stream-sender-execute.c index 69af2072f71fbf..783591f8faf7d1 100644 --- a/src/streaming/stream-sender-execute.c +++ b/src/streaming/stream-sender-execute.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "stream-thread.h" +#include "replication.h" struct inflight_stream_function { struct sender_state *sender; @@ -151,6 +152,11 @@ void stream_sender_execute_commands(struct sender_state *s) { }; ND_LOG_STACK_PUSH(lgs); +#ifdef NETDATA_LOG_STREAM_SENDER + if(!s->log.received) + s->log.received = buffer_create(0, NULL); +#endif + char *start = s->rbuf.b, *end = &s->rbuf.b[s->rbuf.read_len], *newline; *end = '\0'; for( ; start < end ; start = newline + 1) { @@ -169,6 +175,13 @@ void stream_sender_execute_commands(struct sender_state *s) { if(s->defer.end_keyword) { if(strcmp(start, s->defer.end_keyword) == 0) { +#ifdef NETDATA_LOG_STREAM_SENDER + buffer_strcat(s->log.received, buffer_tostring(s->defer.payload)); + buffer_strcat(s->log.received, "\n"); + buffer_strcat(s->log.received, s->defer.end_keyword); + buffer_strcat(s->log.received, "\n"); + stream_sender_log_payload(s, s->log.received, STREAM_TRAFFIC_TYPE_METADATA, true); +#endif s->defer.action(s, s->defer.action_data); cleanup_deferred_data(s); } @@ -180,10 +193,19 @@ void stream_sender_execute_commands(struct sender_state *s) { continue; } +#ifdef NETDATA_LOG_STREAM_SENDER + buffer_reset(s->log.received); + buffer_strcat(s->log.received, start); + buffer_strcat(s->log.received, "\n"); +#endif + s->rbuf.line.num_words = quoted_strings_splitter_whitespace(start, s->rbuf.line.words, PLUGINSD_MAX_WORDS); const char *command = get_word(s->rbuf.line.words, s->rbuf.line.num_words, 0); if(command && strcmp(command, PLUGINSD_CALL_FUNCTION) == 0) { +#ifdef NETDATA_LOG_STREAM_SENDER + stream_sender_log_payload(s, s->log.received, 
STREAM_TRAFFIC_TYPE_FUNCTIONS, true); +#endif char *transaction = get_word(s->rbuf.line.words, s->rbuf.line.num_words, 1); char *timeout_s = get_word(s->rbuf.line.words, s->rbuf.line.num_words, 2); char *function = get_word(s->rbuf.line.words, s->rbuf.line.num_words, 3); @@ -217,6 +239,9 @@ void stream_sender_execute_commands(struct sender_state *s) { } else if(command && strcmp(command, PLUGINSD_CALL_FUNCTION_CANCEL) == 0) { worker_is_busy(WORKER_SENDER_JOB_EXECUTE_FUNCTION); +#ifdef NETDATA_LOG_STREAM_SENDER + stream_sender_log_payload(s, s->log.received, STREAM_TRAFFIC_TYPE_FUNCTIONS, true); +#endif nd_log(NDLS_ACCESS, NDLP_DEBUG, NULL); char *transaction = get_word(s->rbuf.line.words, s->rbuf.line.num_words, 1); @@ -225,6 +250,9 @@ void stream_sender_execute_commands(struct sender_state *s) { } else if(command && strcmp(command, PLUGINSD_CALL_FUNCTION_PROGRESS) == 0) { worker_is_busy(WORKER_SENDER_JOB_EXECUTE_FUNCTION); +#ifdef NETDATA_LOG_STREAM_SENDER + stream_sender_log_payload(s, s->log.received, STREAM_TRAFFIC_TYPE_FUNCTIONS, true); +#endif nd_log(NDLS_ACCESS, NDLP_DEBUG, NULL); char *transaction = get_word(s->rbuf.line.words, s->rbuf.line.num_words, 1); @@ -233,6 +261,9 @@ void stream_sender_execute_commands(struct sender_state *s) { } else if (command && strcmp(command, PLUGINSD_KEYWORD_REPLAY_CHART) == 0) { worker_is_busy(WORKER_SENDER_JOB_EXECUTE_REPLAY); +#ifdef NETDATA_LOG_STREAM_SENDER + stream_sender_log_payload(s, s->log.received, STREAM_TRAFFIC_TYPE_REPLICATION, true); +#endif // do not log replication commands received - way too many! 
// nd_log(NDLS_ACCESS, NDLP_DEBUG, NULL); @@ -262,6 +293,9 @@ void stream_sender_execute_commands(struct sender_state *s) { } else if(command && strcmp(command, PLUGINSD_KEYWORD_NODE_ID) == 0) { worker_is_busy(WORKER_SENDER_JOB_EXECUTE_META); +#ifdef NETDATA_LOG_STREAM_SENDER + stream_sender_log_payload(s, s->log.received, STREAM_TRAFFIC_TYPE_METADATA, true); +#endif stream_sender_get_node_and_claim_id_from_parent(s); } else if(command && strcmp(command, PLUGINSD_KEYWORD_JSON) == 0) { diff --git a/src/streaming/stream-sender-internals.h b/src/streaming/stream-sender-internals.h index cb372d1f06655d..395feaac003979 100644 --- a/src/streaming/stream-sender-internals.h +++ b/src/streaming/stream-sender-internals.h @@ -8,6 +8,7 @@ #include "h2o-common.h" #include "aclk/https_client.h" #include "stream-parents.h" +#include "stream-circular-buffer.h" // connector thread #define WORKER_SENDER_CONNECTOR_JOB_CONNECTING 0 @@ -22,10 +23,6 @@ #define CONNECTED_TO_SIZE 100 -#define CBUFFER_INITIAL_SIZE (16 * 1024) -#define CBUFFER_INITIAL_MAX_SIZE (10 * 1024 * 1024) -#define THREAD_BUFFER_INITIAL_SIZE (CBUFFER_INITIAL_SIZE / 2) - #include "stream-compression/compression.h" #include "stream-conf.h" @@ -48,23 +45,7 @@ struct sender_state { // this is a property of stream_sender_send_msg_to_dispatcher() // protected by dispatcher->messages.spinlock // DO NOT READ OR WRITE ANYWHERE - uint32_t msg_slot; // ensures a dispatcher queue that can never get full - - // statistics about our compression efficiency - size_t bytes_compressed; - size_t bytes_uncompressed; - - // the current buffer statistics - // these SHOULD ALWAYS BE CALCULATED ON EVERY stream_sender_unlock() IF THE BUFFER WAS MODIFIED - // stream_sender_lock() IS REQUIRED TO READ/WRITE THESE - size_t bytes_outstanding; - size_t bytes_available; - NETDATA_DOUBLE buffer_ratio; - - // statistics about successful sends - size_t sends; - size_t bytes_sent; - size_t bytes_sent_by_type[STREAM_TRAFFIC_TYPE_MAX]; + uint32_t 
msg_slot; // ensures a opcode queue that can never get full usec_t last_traffic_ut; @@ -78,10 +59,7 @@ struct sender_state { char connected_to[CONNECTED_TO_SIZE + 1]; // We don't know which proxy we connect to, passed back from socket.c time_t last_state_since_t; // the timestamp of the last state (online/offline) change - struct { - struct circular_buffer *cb; - size_t recreates; - } sbuf; + STREAM_CIRCULAR_BUFFER *scb; struct { char b[PLUGINSD_LINE_MAX + 1]; @@ -92,7 +70,11 @@ struct sender_state { struct compressor_state compressor; #ifdef NETDATA_LOG_STREAM_SENDER - FILE *stream_log_fp; + struct { + SPINLOCK spinlock; + BUFFER *received; + FILE *fp; + } log; #endif struct { @@ -113,11 +95,6 @@ struct sender_state { } replication; - struct { - size_t buffer_used_percentage; // the current utilization of the sending buffer - usec_t last_flush_time_ut; // the last time the sender flushed the sending buffer in USEC - } atomic; - struct { const char *end_keyword; BUFFER *payload; @@ -131,16 +108,11 @@ struct sender_state { #define stream_sender_lock(sender) spinlock_lock(&(sender)->spinlock) #define stream_sender_unlock(sender) spinlock_unlock(&(sender)->spinlock) +#define stream_sender_trylock(sender) spinlock_trylock(&(sender)->spinlock) #define stream_sender_replication_buffer_full_set(sender, value) __atomic_store_n(&((sender)->replication.atomic.reached_max), value, __ATOMIC_SEQ_CST) #define stream_sender_replication_buffer_full_get(sender) __atomic_load_n(&((sender)->replication.atomic.reached_max), __ATOMIC_SEQ_CST) -#define stream_sender_set_buffer_used_percent(sender, value) __atomic_store_n(&((sender)->atomic.buffer_used_percentage), value, __ATOMIC_RELAXED) -#define stream_sender_get_buffer_used_percent(sender) __atomic_load_n(&((sender)->atomic.buffer_used_percentage), __ATOMIC_RELAXED) - -#define stream_sender_set_flush_time(sender) __atomic_store_n(&((sender)->atomic.last_flush_time_ut), now_realtime_usec(), __ATOMIC_RELAXED) -#define 
stream_sender_get_flush_time(sender) __atomic_load_n(&((sender)->atomic.last_flush_time_ut), __ATOMIC_RELAXED) - #define stream_sender_replicating_charts(sender) __atomic_load_n(&((sender)->replication.atomic.charts_replicating), __ATOMIC_RELAXED) #define stream_sender_replicating_charts_plus_one(sender) __atomic_add_fetch(&((sender)->replication.atomic.charts_replicating), 1, __ATOMIC_RELAXED) #define stream_sender_replicating_charts_minus_one(sender) __atomic_sub_fetch(&((sender)->replication.atomic.charts_replicating), 1, __ATOMIC_RELAXED) @@ -160,9 +132,7 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou bool stream_sender_is_host_stopped(struct sender_state *s); -void stream_sender_send_msg_to_dispatcher(struct sender_state *s, struct stream_opcode msg); - -void stream_sender_thread_data_added_data_unsafe(struct sender_state *s, STREAM_TRAFFIC_TYPE type, uint64_t bytes_compressed, uint64_t bytes_uncompressed); +void stream_sender_send_opcode(struct sender_state *s, struct stream_opcode msg); void stream_sender_add_to_queue(struct sender_state *s); @@ -177,4 +147,10 @@ void stream_sender_on_connect(struct sender_state *s); void stream_sender_remove(struct sender_state *s); +#ifdef NETDATA_LOG_STREAM_SENDER +void stream_sender_log_payload(struct sender_state *s, BUFFER *payload, STREAM_TRAFFIC_TYPE type, bool inbound); +#else +#define stream_sender_log_payload(s, payload, type, inbound) debug_dummy() +#endif + #endif //NETDATA_STREAM_SENDER_INTERNALS_H diff --git a/src/streaming/stream-sender.c b/src/streaming/stream-sender.c index d69f3d491bb30a..34a60cc11c5b6b 100644 --- a/src/streaming/stream-sender.c +++ b/src/streaming/stream-sender.c @@ -2,44 +2,48 @@ #include "stream-thread.h" #include "stream-sender-internals.h" +#include "replication.h" static void stream_sender_move_running_to_connector_or_remove(struct stream_thread *sth, struct sender_state *s, STREAM_HANDSHAKE reason, bool reconnect); // 
-------------------------------------------------------------------------------------------------------------------- -static void stream_sender_cbuffer_recreate_timed_unsafe(struct sender_state *s, usec_t now_ut, bool force) { - static __thread usec_t last_reset_time_ut = 0; - - if(!force && now_ut - last_reset_time_ut < 300 * USEC_PER_SEC) - return; - - last_reset_time_ut = now_ut; +#ifdef NETDATA_LOG_STREAM_SENDER +void stream_sender_log_payload(struct sender_state *s, BUFFER *payload, STREAM_TRAFFIC_TYPE type __maybe_unused, bool inbound) { + spinlock_lock(&s->log.spinlock); - s->sbuf.recreates++; // we increase even if we don't do it, to have sender_start() recreate its buffers + if (!s->log.fp) { + char filename[FILENAME_MAX + 1]; + snprintfz( + filename, FILENAME_MAX, "/tmp/stream-sender-%s.txt", s->host ? rrdhost_hostname(s->host) : "unknown"); - if(s->sbuf.cb && s->sbuf.cb->size > CBUFFER_INITIAL_SIZE) { - cbuffer_free(s->sbuf.cb); - s->sbuf.cb = cbuffer_new(CBUFFER_INITIAL_SIZE, stream_send.buffer_max_size, &netdata_buffers_statistics.cbuffers_streaming); + s->log.fp = fopen(filename, "w"); } -} - -static void rrdpush_sender_cbuffer_flush(RRDHOST *host) { - stream_sender_set_flush_time(host->sender); - stream_sender_lock(host->sender); - - // flush the output buffer from any data it may have - cbuffer_flush(host->sender->sbuf.cb); - stream_sender_cbuffer_recreate_timed_unsafe(host->sender, now_monotonic_usec(), true); + if(inbound) { + fprintf( + s->log.fp, + "\n--- RECEIVE MESSAGE START: %s => %s ----\n" + "%s" + "--- RECEIVE MESSAGE END ----------------------------------------\n", + s->connected_to, rrdhost_hostname(s->host), buffer_tostring(payload)); + } + else { + fprintf( + s->log.fp, + "\n--- SEND MESSAGE START: %s => %s ----\n" + "%s" + "--- SEND MESSAGE END ----------------------------------------\n", + rrdhost_hostname(s->host), s->connected_to, buffer_tostring(payload)); + } - stream_sender_unlock(host->sender); + 
spinlock_unlock(&s->log.spinlock); } +#endif // -------------------------------------------------------------------------------------------------------------------- -static void rrdpush_sender_charts_and_replication_reset(struct sender_state *s) { - stream_sender_set_flush_time(s); - +static void stream_sender_charts_and_replication_reset(struct sender_state *s) { // stop all replication commands inflight replication_sender_delete_pending_requests(s); @@ -73,13 +77,14 @@ void stream_sender_on_connect(struct sender_state *s) { rrdhost_flag_set(s->host, RRDHOST_FLAG_STREAM_SENDER_CONNECTED); - rrdpush_sender_charts_and_replication_reset(s); - rrdpush_sender_cbuffer_flush(s->host); + stream_sender_charts_and_replication_reset(s); + + stream_sender_lock(s); + stream_circular_buffer_flush_unsafe(s->scb, stream_send.buffer_max_size); + stream_sender_unlock(s); s->thread.last_traffic_ut = now_monotonic_usec(); s->rbuf.read_len = 0; - s->sbuf.cb->read = 0; - s->sbuf.cb->write = 0; } static void stream_sender_on_ready_to_dispatch(struct sender_state *s) { @@ -103,8 +108,12 @@ static void stream_sender_on_disconnect(struct sender_state *s) { "STREAM SEND [%s]: running on-disconnect hooks...", rrdhost_hostname(s->host)); + stream_sender_lock(s); + stream_circular_buffer_flush_unsafe(s->scb, stream_send.buffer_max_size); + stream_sender_unlock(s); + stream_sender_execute_commands_cleanup(s); - rrdpush_sender_charts_and_replication_reset(s); + stream_sender_charts_and_replication_reset(s); stream_sender_clear_parent_claim_id(s->host); stream_receiver_send_node_and_claim_id_to_child(s->host); stream_path_parent_disconnected(s->host); @@ -150,42 +159,6 @@ static bool stream_sender_log_dst_port(BUFFER *wb, void *ptr) { return true; } -// -------------------------------------------------------------------------------------------------------------------- - -static void stream_sender_thread_data_reset_unsafe(struct sender_state *s) { - memset(s->thread.bytes_sent_by_type, 0, 
sizeof(s->thread.bytes_sent_by_type)); - - s->thread.bytes_uncompressed = 0; - s->thread.bytes_compressed = 0; - s->thread.bytes_outstanding = 0; - s->thread.bytes_available = 0; - s->thread.buffer_ratio = 0.0; - s->thread.sends = 0; - s->thread.bytes_sent = 0; - replication_recalculate_buffer_used_ratio_unsafe(s); -} - -static void stream_sender_thread_data_sent_data_unsafe(struct sender_state *s, uint64_t bytes_sent) { - s->thread.sends++; - s->thread.bytes_sent += bytes_sent; - s->thread.bytes_outstanding = cbuffer_next_unsafe(s->sbuf.cb, NULL); - s->thread.bytes_available = cbuffer_available_size_unsafe(s->sbuf.cb); - s->thread.buffer_ratio = (NETDATA_DOUBLE)(s->sbuf.cb->max_size - s->thread.bytes_available) * 100.0 / (NETDATA_DOUBLE)s->sbuf.cb->max_size; - replication_recalculate_buffer_used_ratio_unsafe(s); -} - -void stream_sender_thread_data_added_data_unsafe(struct sender_state *s, STREAM_TRAFFIC_TYPE type, uint64_t bytes_compressed, uint64_t bytes_uncompressed) { - // calculate the statistics for our dispatcher - s->thread.bytes_sent_by_type[type] += bytes_compressed; - - s->thread.bytes_uncompressed += bytes_uncompressed; - s->thread.bytes_compressed += bytes_compressed; - s->thread.bytes_outstanding = cbuffer_next_unsafe(s->sbuf.cb, NULL); - s->thread.bytes_available = cbuffer_available_size_unsafe(s->sbuf.cb); - s->thread.buffer_ratio = (NETDATA_DOUBLE)(s->sbuf.cb->max_size - s->thread.bytes_available) * 100.0 / (NETDATA_DOUBLE)s->sbuf.cb->max_size; - replication_recalculate_buffer_used_ratio_unsafe(s); -} - // -------------------------------------------------------------------------------------------------------------------- // opcodes @@ -195,7 +168,7 @@ void stream_sender_handle_op(struct stream_thread *sth, struct sender_state *s, ND_LOG_FIELD_CB(NDF_DST_IP, stream_sender_log_dst_ip, s), ND_LOG_FIELD_CB(NDF_DST_PORT, stream_sender_log_dst_port, s), ND_LOG_FIELD_CB(NDF_DST_TRANSPORT, stream_sender_log_transport, s), - 
ND_LOG_FIELD_CB(NDF_SRC_CAPABILITIES, stream_sender_log_capabilities, s), + ND_LOG_FIELD_CB(NDF_DST_CAPABILITIES, stream_sender_log_capabilities, s), ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &streaming_to_parent_msgid), ND_LOG_FIELD_END(), }; @@ -205,18 +178,17 @@ void stream_sender_handle_op(struct stream_thread *sth, struct sender_state *s, worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW); errno_clear(); stream_sender_lock(s); - size_t buffer_size = s->sbuf.cb->size; - size_t buffer_max_size = s->sbuf.cb->max_size; - size_t buffer_available = cbuffer_available_size_unsafe(s->sbuf.cb); + // copy the statistics + STREAM_CIRCULAR_BUFFER_STATS stats = *stream_circular_buffer_stats_unsafe(s->scb); stream_sender_unlock(s); nd_log(NDLS_DAEMON, NDLP_ERR, - "STREAM SEND[%zu] %s [to %s]: send buffer is full (buffer size %zu, max %zu, available %zu). " + "STREAM SEND[%zu] %s [to %s]: send buffer is full (buffer size %u, max %u, used %u, available %u). " "Restarting connection.", sth->id, rrdhost_hostname(s->host), s->connected_to, - buffer_size, buffer_max_size, buffer_available); + stats.bytes_size, stats.bytes_max_size, stats.bytes_outstanding, stats.bytes_available); stream_sender_move_running_to_connector_or_remove( - sth, s, STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SENDER_SEND_BUFFER, true); + sth, s, STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_SEND_BUFFER, true); return; } @@ -276,26 +248,28 @@ void stream_sender_move_queue_to_running_unsafe(struct stream_thread *sth) { "STREAM SEND[%zu] [%s]: moving host from dispatcher queue to dispatcher running...", sth->id, rrdhost_hostname(s->host)); - internal_fatal(SENDERS_GET(&sth->snd.senders, (Word_t)s) != NULL, "Sender already exists in senders list"); - SENDERS_SET(&sth->snd.senders, (Word_t)s, s); - stream_sender_lock(s); s->thread.meta.type = POLLFD_TYPE_SENDER; s->thread.meta.s = s; - if(!nd_poll_add(sth->run.ndpl, s->sock.fd, ND_POLL_READ, &s->thread.meta)) - internal_fatal(true, "Failed to add sender socket to 
nd_poll()"); s->thread.msg.thread_slot = (int32_t)sth->id; s->thread.msg.session = os_random32(); - s->thread.msg.sender = s; + s->thread.msg.meta = &s->thread.meta; s->host->stream.snd.status.tid = gettid_cached(); s->host->stream.snd.status.connections++; s->last_state_since_t = now_realtime_sec(); - stream_sender_thread_data_reset_unsafe(s); + stream_circular_buffer_flush_unsafe(s->scb, stream_send.buffer_max_size); + replication_recalculate_buffer_used_ratio_unsafe(s); stream_sender_unlock(s); + internal_fatal(META_GET(&sth->run.meta, (Word_t)&s->thread.meta) != NULL, "Sender already exists in meta list"); + META_SET(&sth->run.meta, (Word_t)&s->thread.meta, &s->thread.meta); + + if(!nd_poll_add(sth->run.ndpl, s->sock.fd, ND_POLL_READ, &s->thread.meta)) + nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to add sender socket to nd_poll()"); + stream_sender_on_ready_to_dispatch(s); } } @@ -324,20 +298,25 @@ void stream_sender_remove(struct sender_state *s) { rrdhost_stream_parents_reset(s->host, STREAM_HANDSHAKE_EXITING); #ifdef NETDATA_LOG_STREAM_SENDER - if (s->stream_log_fp) { - fclose(s->stream_log_fp); - s->stream_log_fp = NULL; + spinlock_lock(&s->log.spinlock); + if (s->log.fp) { + fclose(s->log.fp); + s->log.fp = NULL; } + buffer_free(s->log.received); + s->log.received = NULL; + spinlock_unlock(&s->log.spinlock); #endif } static void stream_sender_move_running_to_connector_or_remove(struct stream_thread *sth, struct sender_state *s, STREAM_HANDSHAKE reason, bool reconnect) { internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ ); - internal_fatal(SENDERS_GET(&sth->snd.senders, (Word_t)s) == NULL, "Sender to be removed is not in the list of senders"); - SENDERS_DEL(&sth->snd.senders, (Word_t)s); + internal_fatal(META_GET(&sth->run.meta, (Word_t)&s->thread.meta) == NULL, "Sender to be removed is not in the list of senders"); + META_DEL(&sth->run.meta, (Word_t)&s->thread.meta); + 
if(!nd_poll_del(sth->run.ndpl, s->sock.fd)) - internal_fatal(true, "Failed to remove sender socket from nd_poll()"); + nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to delete sender socket from nd_poll()"); // clear this flag asap, to stop other threads from pushing metrics for this node rrdhost_flag_clear(s->host, RRDHOST_FLAG_STREAM_SENDER_CONNECTED | RRDHOST_FLAG_STREAM_SENDER_READY_4_METRICS); @@ -346,7 +325,7 @@ static void stream_sender_move_running_to_connector_or_remove(struct stream_thre stream_sender_lock(s); s->thread.msg.session = 0; - s->thread.msg.sender = NULL; + s->thread.msg.meta = NULL; s->host->stream.snd.status.tid = 0; stream_sender_unlock(s); @@ -378,19 +357,22 @@ void stream_sender_check_all_nodes_from_poll(struct stream_thread *sth, usec_t n NETDATA_DOUBLE overall_buffer_ratio = 0.0; Word_t idx = 0; - for(struct sender_state *s = SENDERS_FIRST(&sth->snd.senders, &idx); - s; - s = SENDERS_NEXT(&sth->snd.senders, &idx)) { + for(struct pollfd_meta *m = META_FIRST(&sth->run.meta, &idx); + m; + m = META_NEXT(&sth->run.meta, &idx)) { + if(m->type != POLLFD_TYPE_SENDER) continue; + struct sender_state *s = m->s; stream_sender_lock(s); - size_t outstanding = cbuffer_next_unsafe(s->sbuf.cb, NULL); - NETDATA_DOUBLE buffer_ratio = s->thread.buffer_ratio; + // copy the statistics + STREAM_CIRCULAR_BUFFER_STATS stats = *stream_circular_buffer_stats_unsafe(s->scb); stream_sender_unlock(s); - if (buffer_ratio > overall_buffer_ratio) - overall_buffer_ratio = buffer_ratio; + if (stats.buffer_ratio > overall_buffer_ratio) + overall_buffer_ratio = stats.buffer_ratio; - if(unlikely(s->thread.last_traffic_ut + stream_send.parents.timeout_s * USEC_PER_SEC < now_ut && + if(unlikely(stats.bytes_outstanding && + s->thread.last_traffic_ut + stream_send.parents.timeout_s * USEC_PER_SEC < now_ut && !stream_sender_pending_replication_requests(s) && !stream_sender_replicating_charts(s) )) { @@ -408,27 +390,29 @@ void stream_sender_check_all_nodes_from_poll(struct stream_thread 
*sth, usec_t n worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); - char since[RFC3339_MAX_LENGTH]; - rfc3339_datetime_ut(since, sizeof(since), s->thread.last_traffic_ut, 2, false); + char duration[RFC3339_MAX_LENGTH]; + duration_snprintf(duration, sizeof(duration), (int64_t)(now_monotonic_usec() - s->thread.last_traffic_ut), "us", true); - char pending[64]; - size_snprintf(pending, sizeof(pending), outstanding, "B", false); + char pending[64] = "0"; + if(stats.bytes_outstanding) + size_snprintf(pending, sizeof(pending), stats.bytes_outstanding, "B", false); nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM SEND[%zu] %s [send to %s]: could not send data for %ld seconds - closing connection - " - "we have sent %zu bytes in %zu operations, it is idle since: %s, and we have %s pending to send " + "we have sent %zu bytes in %zu operations, it is idle for %s, and we have %s pending to send " "(buffer is used %.2f%%).", sth->id, rrdhost_hostname(s->host), s->connected_to, stream_send.parents.timeout_s, - s->thread.bytes_sent, s->thread.sends, since, pending, buffer_ratio); + stats.bytes_sent, stats.sends, + duration, pending, stats.buffer_ratio); stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_TIMEOUT, true); continue; } - bytes_compressed += s->thread.bytes_compressed; - bytes_uncompressed += s->thread.bytes_uncompressed; + bytes_compressed += stats.bytes_added; + bytes_uncompressed += stats.bytes_uncompressed; - if(!nd_poll_upd(sth->run.ndpl, s->sock.fd, ND_POLL_READ | (outstanding ? ND_POLL_WRITE : 0), &s->thread.meta)) + if(!nd_poll_upd(sth->run.ndpl, s->sock.fd, ND_POLL_READ | (stats.bytes_outstanding ? 
ND_POLL_WRITE : 0), &s->thread.meta)) nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM SEND[%zu] %s [send to %s]: failed to update nd_poll().", sth->id, rrdhost_hostname(s->host), s->connected_to); @@ -458,7 +442,7 @@ void stream_sender_process_poll_events(struct stream_thread *sth, struct sender_ }; ND_LOG_STACK_PUSH(lgs); - if(unlikely(events & ND_POLL_ERROR)) { + if(unlikely(events & (ND_POLL_ERROR|ND_POLL_HUP|ND_POLL_INVALID))) { // we have errors on this socket worker_is_busy(WORKER_STREAM_JOB_SOCKET_ERROR); @@ -474,9 +458,14 @@ void stream_sender_process_poll_events(struct stream_thread *sth, struct sender_ worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SOCKET_ERROR); + stream_sender_lock(s); + // copy the statistics + STREAM_CIRCULAR_BUFFER_STATS stats = *stream_circular_buffer_stats_unsafe(s->scb); + stream_sender_unlock(s); + nd_log(NDLS_DAEMON, NDLP_ERR, - "STREAM SEND[%zu] %s [send to %s]: %s restarting connection - %zu bytes transmitted.", - sth->id, rrdhost_hostname(s->host), s->connected_to, error, s->thread.bytes_sent); + "STREAM SEND[%zu] %s [to %s]: %s restarting connection - %zu bytes transmitted in %zu operations.", + sth->id, rrdhost_hostname(s->host), s->connected_to, error, stats.bytes_sent, stats.sends); stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, true); return; @@ -485,90 +474,116 @@ void stream_sender_process_poll_events(struct stream_thread *sth, struct sender_ if(events & ND_POLL_WRITE) { // we can send data on this socket - worker_is_busy(WORKER_STREAM_JOB_SOCKET_SEND); + if(stream_sender_trylock(s)) { + worker_is_busy(WORKER_STREAM_JOB_SOCKET_SEND); - bool disconnect = false; - stream_sender_lock(s); - { + const char *disconnect_reason = NULL; + STREAM_HANDSHAKE reason; + + STREAM_CIRCULAR_BUFFER_STATS *stats = stream_circular_buffer_stats_unsafe(s->scb); char *chunk; - size_t outstanding = cbuffer_next_unsafe(s->sbuf.cb, &chunk); - ssize_t bytes = nd_sock_send_nowait(&s->sock, chunk, 
outstanding); - if (likely(bytes > 0)) { - cbuffer_remove_unsafe(s->sbuf.cb, bytes); - stream_sender_thread_data_sent_data_unsafe(s, bytes); + size_t outstanding = stream_circular_buffer_get_unsafe(s->scb, &chunk); + ssize_t rc = nd_sock_send_nowait(&s->sock, chunk, outstanding); + if (likely(rc > 0)) { + stream_circular_buffer_del_unsafe(s->scb, rc); + replication_recalculate_buffer_used_ratio_unsafe(s); s->thread.last_traffic_ut = now_ut; - sth->snd.bytes_sent += bytes; + sth->snd.bytes_sent += rc; - if(!s->thread.bytes_outstanding) { + if (!stats->bytes_outstanding) { // we sent them all - remove ND_POLL_WRITE - if(!nd_poll_upd(sth->run.ndpl, s->sock.fd, ND_POLL_READ, &s->thread.meta)) + if (!nd_poll_upd(sth->run.ndpl, s->sock.fd, ND_POLL_READ, &s->thread.meta)) nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM SEND[%zu] %s [send to %s]: failed to update nd_poll().", sth->id, rrdhost_hostname(s->host), s->connected_to); // recreate the circular buffer if we have to - stream_sender_cbuffer_recreate_timed_unsafe(s, now_ut, false); + stream_circular_buffer_recreate_timed_unsafe(s->scb, now_ut, false); } } - else if (bytes < 0 && errno != EWOULDBLOCK && errno != EAGAIN && errno != EINTR) - disconnect = true; - } - stream_sender_unlock(s); + else if (rc == 0 || errno == ECONNRESET) { + disconnect_reason = "socket reports EOF (closed by parent)"; + reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END; + } + else if (rc < 0) { + if(errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR) + // will try later + ; + else { + disconnect_reason = "socket reports error while writing"; + reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED; + } + } + stream_sender_unlock(s); - if(disconnect) { - worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR); - nd_log(NDLS_DAEMON, NDLP_ERR, - "STREAM SEND[%zu] %s [send to %s]: failed to send metrics - restarting connection - " - "we have sent %zu bytes on this connection.", - sth->id, rrdhost_hostname(s->host), s->connected_to, 
s->thread.bytes_sent); - stream_sender_move_running_to_connector_or_remove( - sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED, true); - return; + if (disconnect_reason) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR); + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM SEND[%zu] %s [to %s]: %s (%zd, on fd %d) - restarting connection - " + "we have sent %zu bytes in %zu operations.", + sth->id, rrdhost_hostname(s->host), s->connected_to, disconnect_reason, rc, s->sock.fd, + stats->bytes_sent, stats->sends); + + stream_sender_move_running_to_connector_or_remove(sth, s, reason, true); + + return; + } } } - if(events & ND_POLL_READ) { - // we can receive data from this socket + if(!(events & ND_POLL_READ)) + return; + + // we can receive data from this socket + + worker_is_busy(WORKER_STREAM_JOB_SOCKET_RECEIVE); + while(true) { + // we have to drain the socket! + + ssize_t rc = nd_sock_revc_nowait(&s->sock, s->rbuf.b + s->rbuf.read_len, sizeof(s->rbuf.b) - s->rbuf.read_len - 1); + if (likely(rc > 0)) { + s->rbuf.read_len += rc; - worker_is_busy(WORKER_STREAM_JOB_SOCKET_RECEIVE); - ssize_t bytes = nd_sock_revc_nowait(&s->sock, s->rbuf.b + s->rbuf.read_len, sizeof(s->rbuf.b) - s->rbuf.read_len - 1); - if (bytes > 0) { - s->rbuf.read_len += bytes; s->thread.last_traffic_ut = now_ut; - sth->snd.bytes_received += bytes; + sth->snd.bytes_received += rc; + + worker_is_busy(WORKER_SENDER_JOB_EXECUTE); + stream_sender_execute_commands(s); } - else if (bytes == 0 || errno == ECONNRESET) { - worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED); + else if (rc == 0 || errno == ECONNRESET) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_REMOTE_CLOSED); nd_log(NDLS_DAEMON, NDLP_ERR, - "STREAM SEND[%zu] %s [send to %s]: connection (fd %d) closed by far end.", + "STREAM SEND[%zu] %s [to %s]: socket %d reports EOF (closed by parent).", sth->id, rrdhost_hostname(s->host), s->connected_to, s->sock.fd); stream_sender_move_running_to_connector_or_remove( - sth, s, 
STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_PARENT, true); + sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END, true); return; } - else if (bytes < 0 && errno != EWOULDBLOCK && errno != EAGAIN && errno != EINTR) { - worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR); - nd_log(NDLS_DAEMON, NDLP_ERR, - "STREAM SEND[%zu] %s [send to %s]: error during receive (%zd, on fd %d) - restarting connection.", - sth->id, rrdhost_hostname(s->host), s->connected_to, bytes, s->sock.fd); - stream_sender_move_running_to_connector_or_remove( - sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED, true); - return; + else if (rc < 0) { + if(errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR) + // will try later + break; + else { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR); + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM SEND[%zu] %s [to %s]: error during receive (%zd, on fd %d) - restarting connection.", + sth->id, rrdhost_hostname(s->host), s->connected_to, rc, s->sock.fd); + stream_sender_move_running_to_connector_or_remove( + sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED, true); + return; + } } } - - if(unlikely(s->rbuf.read_len)) { - worker_is_busy(WORKER_SENDER_JOB_EXECUTE); - stream_sender_execute_commands(s); - } } void stream_sender_cleanup(struct stream_thread *sth) { // stop all hosts Word_t idx = 0; - for(struct sender_state *s = SENDERS_FIRST(&sth->snd.senders, &idx); - s; - s = SENDERS_NEXT(&sth->snd.senders, &idx)) { + for(struct pollfd_meta *m = META_FIRST(&sth->run.meta, &idx); + m; + m = META_NEXT(&sth->run.meta, &idx)) { + if(m->type != POLLFD_TYPE_SENDER) continue; + struct sender_state *s = m->s; ND_LOG_STACK lgs[] = { ND_LOG_FIELD_STR(NDF_NIDL_NODE, s->host->hostname), @@ -583,8 +598,4 @@ void stream_sender_cleanup(struct stream_thread *sth) { stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN, false); } - - // cleanup - SENDERS_FREE(&sth->snd.senders, NULL); } - diff 
--git a/src/streaming/stream-thread.c b/src/streaming/stream-thread.c index 2f4e25e1328264..1a04e2a3e8917a 100644 --- a/src/streaming/stream-thread.c +++ b/src/streaming/stream-thread.c @@ -4,7 +4,7 @@ struct stream_thread_globals stream_thread_globals = { .assign = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, } }; @@ -16,20 +16,35 @@ static void stream_thread_handle_op(struct stream_thread *sth, struct stream_opc sth->messages.processed++; - struct sender_state *s = msg->sender ? SENDERS_GET(&sth->snd.senders, (Word_t)msg->sender) : NULL; - - if (msg->session && // there is a session - s && // there is a sender - (size_t)msg->thread_slot == sth->id) // same thread + struct pollfd_meta *m = META_GET(&sth->run.meta, (Word_t)msg->meta); + if (m && // there is a meta + m == msg->meta && // the meta are equal + msg->session && // there is a session + (size_t)msg->thread_slot == sth->id && // the right thread + (m->type == POLLFD_TYPE_SENDER || m->type == POLLFD_TYPE_RECEIVER) && // it is either sender or receiver + ((m->type == POLLFD_TYPE_SENDER && m == &m->s->thread.meta) || // sender matches + (m->type == POLLFD_TYPE_RECEIVER && m == &m->rpt->thread.meta))) // receiver matches { - if(msg->opcode & STREAM_OPCODE_SENDER_POLLOUT) { - if(!nd_poll_upd(sth->run.ndpl, s->sock.fd, ND_POLL_READ|ND_POLL_WRITE, &s->thread.meta)) - internal_fatal(true, "Failed to update sender socket in nd_poll()"); - msg->opcode &= ~(STREAM_OPCODE_SENDER_POLLOUT); + if(m->type == POLLFD_TYPE_SENDER) { + if(msg->opcode & STREAM_OPCODE_SENDER_POLLOUT) { + if(!nd_poll_upd(sth->run.ndpl, m->s->sock.fd, ND_POLL_READ|ND_POLL_WRITE, m)) + internal_fatal(true, "Failed to update sender socket in nd_poll()"); + msg->opcode &= ~(STREAM_OPCODE_SENDER_POLLOUT); + } + + if(msg->opcode) + stream_sender_handle_op(sth, m->s, msg); } + else if(m->type == POLLFD_TYPE_RECEIVER) { + if (msg->opcode & STREAM_OPCODE_RECEIVER_POLLOUT) { + if (!nd_poll_upd(sth->run.ndpl, 
m->rpt->sock.fd, ND_POLL_READ | ND_POLL_WRITE, m)) + internal_fatal(true, "Failed to update receiver socket in nd_poll()"); + msg->opcode &= ~(STREAM_OPCODE_RECEIVER_POLLOUT); + } - if(msg->opcode) - stream_sender_handle_op(sth, s, msg); + if (msg->opcode) + stream_receiver_handle_op(sth, m->rpt, msg); + } } else { // this may happen if we receive a POLLOUT opcode, but the sender has been disconnected @@ -37,36 +52,125 @@ static void stream_thread_handle_op(struct stream_thread *sth, struct stream_opc } } -void stream_sender_send_msg_to_dispatcher(struct sender_state *s, struct stream_opcode msg) { - if (!msg.session || !msg.sender || !s) +static void stream_thread_send_pipe_signal(struct stream_thread *sth) { + if(sth->tid == gettid_cached()) + // no need for this if we are the same thread + // we will process all the events shortly return; - internal_fatal(msg.sender != s, "the sender pointer in the message does not match this sender"); + if(sth->pipe.fds[PIPE_WRITE] != -1 && + write(sth->pipe.fds[PIPE_WRITE], " ", 1) != 1) { + nd_log_limit_static_global_var(erl, 1, 1 * USEC_PER_MS); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, + "STREAM THREAD[%zu]: cannot write to signal pipe", sth->id); + } +} + +void stream_receiver_send_opcode(struct receiver_state *rpt, struct stream_opcode msg) { + if (!msg.session || !msg.meta || !rpt) + return; + internal_fatal(msg.meta != &rpt->thread.meta, "the receiver pointer in the message does not match this receiver"); struct stream_thread *sth = stream_thread_by_slot_id(msg.thread_slot); if(!sth) { internal_fatal(true, - "STREAM SEND[x] [%s] thread pointer in the opcode message does not match the expected", - rrdhost_hostname(s->host)); + "STREAM RECEIVE[x] [%s] thread pointer in the opcode message does not match the expected", + rrdhost_hostname(rpt->host)); + return; + } + + // check if we can execute the message now + if(msg.opcode == STREAM_OPCODE_RECEIVER_POLLOUT && sth->tid == gettid_cached()) { + // we are running at the 
stream thread, and the request is about enabling POLLOUT, + // we can do this synchronously. + // IMPORTANT: DO NOT HANDLE FAILURES THAT REMOVE THE RECEIVER OR THE SENDER THIS WAY + // THE EVENT LOOP DRAINS THE INPUT SOCKET (BOTH RECEIVER AND SENDER) + // AND THE LOOP WILL CRASH IF THE RECEIVER OR THE SENDER VANISH WHILE IT + // WORKS WITH THEM! + sth->messages.bypassed++; + stream_thread_handle_op(sth, &msg); return; } bool send_pipe_msg = false; + // add it to the message queue of the thread + spinlock_lock(&sth->messages.spinlock); + { + sth->messages.added++; + if (rpt->thread.send_to_child.msg_slot >= sth->messages.used || sth->messages.array[rpt->thread.send_to_child.msg_slot].meta != &rpt->thread.meta) { + if (unlikely(sth->messages.used >= sth->messages.size)) { + // this should never happen, but let's find the root cause + + if (!sth->messages.size) { + // we are exiting + spinlock_unlock(&sth->messages.spinlock); + return; + } + + // try to find us in the list + for (size_t i = 0; i < sth->messages.size; i++) { + if (sth->messages.array[i].meta == &rpt->thread.meta) { + rpt->thread.send_to_child.msg_slot = i; + sth->messages.array[rpt->thread.send_to_child.msg_slot].opcode |= msg.opcode; + spinlock_unlock(&sth->messages.spinlock); + internal_fatal(true, "the stream opcode queue is full, but this receiver is already on slot %zu", i); + return; + } + } + + fatal("The streaming opcode queue is full, but this should never happen"); + } + + // let's use a new slot + send_pipe_msg = !sth->messages.used; // write to the pipe, only when the queue was empty before this msg + rpt->thread.send_to_child.msg_slot = sth->messages.used++; + sth->messages.array[rpt->thread.send_to_child.msg_slot] = msg; + } + else + // the existing slot is good + sth->messages.array[rpt->thread.send_to_child.msg_slot].opcode |= msg.opcode; + } + spinlock_unlock(&sth->messages.spinlock); + + // signal the streaming thread to wake up and process messages + if(send_pipe_msg) + 
stream_thread_send_pipe_signal(sth); +} + +void stream_sender_send_opcode(struct sender_state *s, struct stream_opcode msg) { + if (!msg.session || !msg.meta || !s) + return; + + internal_fatal(msg.meta != &s->thread.meta, "the sender pointer in the message does not match this sender"); + struct stream_thread *sth = stream_thread_by_slot_id(msg.thread_slot); + if(!sth) { + internal_fatal(true, + "STREAM SEND[x] [%s] thread pointer in the opcode message does not match the expected", + rrdhost_hostname(s->host)); + return; + } + // check if we can execute the message now - if(sth->tid == gettid_cached()) { - // we are running at the dispatcher thread - // no need for locks or queuing + if(msg.opcode == STREAM_OPCODE_SENDER_POLLOUT && sth->tid == gettid_cached()) { + // we are running at the stream thread, and the request is about enabling POLLOUT, + // we can do this synchronously. + // IMPORTANT: DO NOT HANDLE FAILURES THAT REMOVE THE RECEIVER OR THE SENDER THIS WAY + // THE EVENT LOOP DRAINS THE INPUT SOCKET (BOTH RECEIVER AND SENDER) + // AND THE LOOP WILL CRASH IF THE RECEIVER OR THE SENDER VANISH WHILE IT + // WORKS WITH THEM! 
sth->messages.bypassed++; stream_thread_handle_op(sth, &msg); return; } + bool send_pipe_msg = false; + // add it to the message queue of the thread spinlock_lock(&sth->messages.spinlock); { sth->messages.added++; - if (s->thread.msg_slot >= sth->messages.used || sth->messages.array[s->thread.msg_slot].sender != s) { + if (s->thread.msg_slot >= sth->messages.used || sth->messages.array[s->thread.msg_slot].meta != &s->thread.meta) { if (unlikely(sth->messages.used >= sth->messages.size)) { // this should never happen, but let's find the root cause @@ -78,7 +182,7 @@ void stream_sender_send_msg_to_dispatcher(struct sender_state *s, struct stream_ // try to find us in the list for (size_t i = 0; i < sth->messages.size; i++) { - if (sth->messages.array[i].sender == s) { + if (sth->messages.array[i].meta == &s->thread.meta) { s->thread.msg_slot = i; sth->messages.array[s->thread.msg_slot].opcode |= msg.opcode; spinlock_unlock(&sth->messages.spinlock); @@ -87,7 +191,7 @@ void stream_sender_send_msg_to_dispatcher(struct sender_state *s, struct stream_ } } - fatal("the dispatcher message queue is full, but this should never happen"); + fatal("the streaming opcode queue is full, but this should never happen"); } // let's use a new slot @@ -102,14 +206,8 @@ void stream_sender_send_msg_to_dispatcher(struct sender_state *s, struct stream_ spinlock_unlock(&sth->messages.spinlock); // signal the streaming thread to wake up and process messages - if(send_pipe_msg && - sth->pipe.fds[PIPE_WRITE] != -1 && - write(sth->pipe.fds[PIPE_WRITE], " ", 1) != 1) { - nd_log_limit_static_global_var(erl, 1, 1 * USEC_PER_MS); - nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, - "STREAM SEND [%s]: cannot write to signal pipe", - rrdhost_hostname(s->host)); - } + if(send_pipe_msg) + stream_thread_send_pipe_signal(sth); } static void stream_thread_read_pipe_messages(struct stream_thread *sth) { @@ -184,7 +282,6 @@ static bool stream_thread_process_poll_slot(struct stream_thread *sth, nd_poll_r 
switch(m->type) { case POLLFD_TYPE_SENDER: { struct sender_state *s = m->s; - internal_fatal(SENDERS_GET(&sth->snd.senders, (Word_t)s) == NULL, "Sender is not found in the senders list"); stream_sender_process_poll_events(sth, s, ev->events, now_ut); *replay_entries += dictionary_entries(s->replication.requests); break; @@ -192,7 +289,6 @@ static bool stream_thread_process_poll_slot(struct stream_thread *sth, nd_poll_r case POLLFD_TYPE_RECEIVER: { struct receiver_state *rpt = m->rpt; - internal_fatal(RECEIVERS_GET(&sth->rcv.receivers, (Word_t)rpt) == NULL, "Receiver is not found in the receiver list"); stream_receive_process_poll_events(sth, rpt, ev->events, now_ut); break; } @@ -250,7 +346,7 @@ void *stream_thread(void *ptr) { worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW, "disconnect overflow"); worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT, "disconnect timeout"); worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SOCKET_ERROR, "disconnect socket error"); - worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED, "disconnect parent closed"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_REMOTE_CLOSED, "disconnect remote closed"); worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR, "disconnect receive error"); worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR, "disconnect send error"); worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_COMPRESSION_ERROR, "disconnect compression error"); @@ -328,6 +424,8 @@ void *stream_thread(void *ptr) { if(!sth->run.ndpl) fatal("Cannot create nd_poll()"); + META_SET(&sth->run.meta, (Word_t)&sth->run.pipe, &sth->run.pipe); + if(!nd_poll_add(sth->run.ndpl, sth->pipe.fds[PIPE_READ], ND_POLL_READ, &sth->run.pipe)) internal_fatal(true, "Failed to add pipe to nd_poll()"); @@ -393,6 +491,11 @@ void *stream_thread(void *ptr) { if(nd_thread_signaled_to_cancel() || !service_running(SERVICE_STREAMING)) break; + // nd_poll() may have received events 
for a socket we have already removed + // so, if we don't find it in our meta index, do not access it - it has been removed + if(META_GET(&sth->run.meta, (Word_t)ev.data) != ev.data) + continue; + now_ut = now_monotonic_usec(); exit_thread = stream_thread_process_poll_slot(sth, &ev, now_ut, &replay_entries); } @@ -406,6 +509,7 @@ void *stream_thread(void *ptr) { // cleanup receiver and dispatcher stream_sender_cleanup(sth); stream_receiver_cleanup(sth); + META_FREE(&sth->run.meta, NULL); // cleanup the thread structures spinlock_lock(&sth->messages.spinlock); diff --git a/src/streaming/stream-thread.h b/src/streaming/stream-thread.h index 511a31fe9891af..b3587e642920fa 100644 --- a/src/streaming/stream-thread.h +++ b/src/streaming/stream-thread.h @@ -4,6 +4,7 @@ #define NETDATA_STREAM_THREAD_H #include "libnetdata/libnetdata.h" +#include "stream-circular-buffer.h" struct stream_thread; struct pollfd_slotted { @@ -17,17 +18,19 @@ struct pollfd_slotted { typedef enum __attribute__((packed)) { STREAM_OPCODE_NONE = 0, STREAM_OPCODE_SENDER_POLLOUT = (1 << 0), // move traffic around as soon as possible - STREAM_OPCODE_SENDER_BUFFER_OVERFLOW = (1 << 1), // reconnect the node, it has buffer overflow - STREAM_OPCODE_SENDER_RECONNECT_WITHOUT_COMPRESSION = (1 << 2), // reconnect the node, but disable compression - STREAM_OPCODE_SENDER_STOP_RECEIVER_LEFT = (1 << 3), // disconnect the node, the receiver left - STREAM_OPCODE_SENDER_STOP_HOST_CLEANUP = (1 << 4), // disconnect the node, it is being de-allocated + STREAM_OPCODE_RECEIVER_POLLOUT = (1 << 1), // disconnect the node, it has buffer overflow + STREAM_OPCODE_SENDER_BUFFER_OVERFLOW = (1 << 2), // reconnect the node, it has buffer overflow + STREAM_OPCODE_RECEIVER_BUFFER_OVERFLOW = (1 << 3), // reconnect the node, it has buffer overflow + STREAM_OPCODE_SENDER_RECONNECT_WITHOUT_COMPRESSION = (1 << 4), // reconnect the node, but disable compression + STREAM_OPCODE_SENDER_STOP_RECEIVER_LEFT = (1 << 5), // disconnect the node, 
the receiver left + STREAM_OPCODE_SENDER_STOP_HOST_CLEANUP = (1 << 6), // disconnect the node, it is being de-allocated } STREAM_OPCODE; struct stream_opcode { int32_t thread_slot; // the dispatcher id this message refers to uint32_t session; // random number used to verify that the message the dispatcher receives is for this sender STREAM_OPCODE opcode; // the actual message to be delivered - struct sender_state *sender; + struct pollfd_meta *meta; }; // IMPORTANT: to add workers, you have to edit WORKER_PARSER_FIRST_JOB accordingly @@ -61,7 +64,7 @@ struct stream_opcode { #define WORKER_SENDER_JOB_DISCONNECT_OVERFLOW (WORKER_PARSER_FIRST_JOB - 18) #define WORKER_SENDER_JOB_DISCONNECT_TIMEOUT (WORKER_PARSER_FIRST_JOB - 17) #define WORKER_SENDER_JOB_DISCONNECT_SOCKET_ERROR (WORKER_PARSER_FIRST_JOB - 16) -#define WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED (WORKER_PARSER_FIRST_JOB - 15) +#define WORKER_SENDER_JOB_DISCONNECT_REMOTE_CLOSED (WORKER_PARSER_FIRST_JOB - 15) #define WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR (WORKER_PARSER_FIRST_JOB - 14) #define WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR (WORKER_PARSER_FIRST_JOB - 13) #define WORKER_SENDER_JOB_DISCONNECT_COMPRESSION_ERROR (WORKER_PARSER_FIRST_JOB - 12) @@ -105,6 +108,7 @@ struct pollfd_meta { DEFINE_JUDYL_TYPED(SENDERS, struct sender_state *); DEFINE_JUDYL_TYPED(RECEIVERS, struct receiver_state *); +DEFINE_JUDYL_TYPED(META, struct pollfd_meta *); struct stream_thread { ND_THREAD *thread; @@ -114,13 +118,11 @@ struct stream_thread { size_t nodes_count; struct { - SENDERS_JudyLSet senders; size_t bytes_received; size_t bytes_sent; } snd; struct { - RECEIVERS_JudyLSet receivers; size_t bytes_received; size_t bytes_received_uncompressed; NETDATA_DOUBLE replication_completion; @@ -155,6 +157,7 @@ struct stream_thread { struct { nd_poll_t *ndpl; struct pollfd_meta pipe; + META_JudyLSet meta; } run; }; diff --git a/src/streaming/stream.h b/src/streaming/stream.h index 27ed5493c232ae..a4e4e08f98f08d 100644 --- 
a/src/streaming/stream.h +++ b/src/streaming/stream.h @@ -38,7 +38,6 @@ void stream_receiver_free(struct receiver_state *rpt); bool stream_receiver_signal_to_stop_and_wait(struct rrdhost *host, STREAM_HANDSHAKE reason); char *stream_receiver_program_version_strdupz(struct rrdhost *host); -#include "replication.h" #include "rrdhost-status.h" #include "protocol/commands.h" #include "stream-path.h" diff --git a/src/web/api/queries/query.c b/src/web/api/queries/query.c index 005572f0c9a5e8..c5acfccad05f32 100644 --- a/src/web/api/queries/query.c +++ b/src/web/api/queries/query.c @@ -2018,7 +2018,7 @@ void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s storage_engine_query_finalize(&seqh); store_metric_collection_completed(); - telemetry_queries_backfill_query_completed(points_read); + pulse_queries_backfill_query_completed(points_read); //internal_error(true, "DBENGINE: backfilled chart '%s', dimension '%s', tier %d, from %ld to %ld, with %zu points from tier %d", // rd->rrdset->name, rd->name, tier, after_wanted, before_wanted, points, tr); @@ -3592,7 +3592,7 @@ RRDR *rrd2rrdr(ONEWAYALLOC *owa, QUERY_TARGET *qt) { continue; } - telemetry_queries_rrdr_query_completed( + pulse_queries_rrdr_query_completed( 1, r_tmp->stats.db_points_read - last_db_points_read, r_tmp->stats.result_points_generated - last_result_points_generated, diff --git a/src/web/api/v3/api_v3_settings.c b/src/web/api/v3/api_v3_settings.c index 3b02e6b61c0e2d..51a18405dc8d25 100644 --- a/src/web/api/v3/api_v3_settings.c +++ b/src/web/api/v3/api_v3_settings.c @@ -37,7 +37,7 @@ // we need an r/w spinlock to ensure that reads and write do not happen // concurrently for settings files -static RW_SPINLOCK settings_spinlock = NETDATA_RW_SPINLOCK_INITIALIZER; +static RW_SPINLOCK settings_spinlock = RW_SPINLOCK_INITIALIZER; static inline void settings_path(char out[FILENAME_MAX]) { filename_from_path_entry(out, netdata_configured_varlib_dir, "settings", NULL); diff --git 
a/src/web/rtc/webrtc.c b/src/web/rtc/webrtc.c index 07cbfc5e66ffd6..4267b340d45cae 100644 --- a/src/web/rtc/webrtc.c +++ b/src/web/rtc/webrtc.c @@ -115,7 +115,7 @@ static struct { .proxyServer = NULL, // [("http"|"socks5") (":"|"://")][username ":" password "@"]hostname[" :" port] .bindAddress = NULL, .unsafe = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .head = NULL, }, }; diff --git a/src/web/server/static/static-threaded.c b/src/web/server/static/static-threaded.c index c874f56fd9ccde..b2609fa3b278db 100644 --- a/src/web/server/static/static-threaded.c +++ b/src/web/server/static/static-threaded.c @@ -123,7 +123,7 @@ static void web_server_file_del_callback(POLLINFO *pi) { web_server_log_connection(w, "DISCONNECTED"); web_client_request_done(w); web_client_release_to_cache(w); - telemetry_web_client_disconnected(); + pulse_web_client_disconnected(); } worker_is_idle(); @@ -269,7 +269,7 @@ static void web_server_del_callback(POLLINFO *pi) { web_server_log_connection(w, "DISCONNECTED"); web_client_request_done(w); web_client_release_to_cache(w); - telemetry_web_client_disconnected(); + pulse_web_client_disconnected(); } worker_is_idle(); @@ -509,7 +509,8 @@ void *socket_listen_main_static_threaded(void *ptr) { // 6 threads is the optimal value // since 6 are the parallel connections browsers will do // so, if the machine has more CPUs, avoid using resources unnecessarily - int def_thread_count = MIN(get_netdata_cpus(), 6); + int def_thread_count = (int)get_netdata_cpus(); + if(def_thread_count < 6) def_thread_count = 6; if (!strcmp(config_get(CONFIG_SECTION_WEB, "mode", ""),"single-threaded")) { netdata_log_info("Running web server with one thread, because mode is single-threaded"); diff --git a/src/web/server/web_client.c b/src/web/server/web_client.c index a688f95be0d4f0..5357e3e835caa4 100644 --- a/src/web/server/web_client.c +++ b/src/web/server/web_client.c @@ -225,7 +225,7 @@ void web_client_log_completed_request(struct 
web_client *w, bool update_web_stat size_t sent = w->response.zoutput ? (size_t)w->response.zstream.total_out : size; if(update_web_stats) - telemetry_web_request_completed( + pulse_web_request_completed( dt_usec(&tv, &w->timings.tv_in), w->statistics.received_bytes, w->statistics.sent_bytes, size, sent); usec_t prep_ut = w->timings.tv_ready.tv_sec ? dt_usec(&w->timings.tv_ready, &w->timings.tv_in) : 0; diff --git a/src/web/server/web_client_cache.c b/src/web/server/web_client_cache.c index 2a0ade755a6917..16dcd1631b8a80 100644 --- a/src/web/server/web_client_cache.c +++ b/src/web/server/web_client_cache.c @@ -33,14 +33,14 @@ static struct clients_cache { } avail; } web_clients_cache = { .used = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .head = NULL, .count = 0, .reused = 0, .allocated = 0, }, .avail = { - .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .spinlock = SPINLOCK_INITIALIZER, .head = NULL, .count = 0, }, @@ -103,7 +103,7 @@ struct web_client *web_client_get_from_cache(void) { w = web_client_create(&netdata_buffers_statistics.buffers_web); spinlock_lock(&web_clients_cache.used.spinlock); - w->id = telemetry_web_client_connected(); + w->id = pulse_web_client_connected(); web_clients_cache.used.allocated++; }