From ebda01f576afdad37cf671d5cb929feeb22a5a4b Mon Sep 17 00:00:00 2001 From: Netdata bot <43409846+netdatabot@users.noreply.github.com> Date: Fri, 12 Jul 2024 03:04:13 -0400 Subject: [PATCH 01/18] Regenerate integrations.js (#18124) Co-authored-by: ilyam8 <22274335+ilyam8@users.noreply.github.com> --- integrations/integrations.js | 2 +- integrations/integrations.json | 2 +- src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/integrations/integrations.js b/integrations/integrations.js index f391fcf9dc5965..f1632a43b7b70b 100644 --- a/integrations/integrations.js +++ b/integrations/integrations.js @@ -16105,7 +16105,7 @@ export const integrations = [ "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n{% details open=true summary=\"Config options\" %}\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. | 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. | 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. 
| standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n{% /details %}\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n{% /details %}\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `smartctl` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m smartctl\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `smartctl` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep smartctl\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep smartctl /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep smartctl\n```\n\n", "alerts": "## Alerts\n\nThere are no alerts configured by default for this integration.\n", - "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. 
An instance is uniquely identified by a set of labels.\n\n\n\n### Per controller\n\nThese metrics refer to the Storage Device.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| device_name | Device name |\n| device_type | Device type |\n| model_name | Model name |\n| serial_number | Serial number |\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| smartctl.device_smart_status | passed, failed | status |\n| smartctl.device_ata_smart_error_log_count | error_log | logs |\n| smartctl.device_power_on_time | power_on_time | seconds |\n| smartctl.device_temperature | temperature | Celsius |\n| smartctl.device_power_cycles_count | power | cycles |\n| smartctl.device_smart_attr_{attribute_name} | {attribute_name} | {attribute_unit} |\n| smartctl.device_smart_attr_{attribute_name}_normalized | {attribute_name} | value |\n\n", + "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per controller\n\nThese metrics refer to the Storage Device.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| device_name | Device name |\n| device_type | Device type |\n| model_name | Model name |\n| serial_number | Serial number |\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| smartctl.device_smart_status | passed, failed | status |\n| smartctl.device_ata_smart_error_log_count | error_log | logs |\n| smartctl.device_power_on_time | power_on_time | seconds |\n| smartctl.device_temperature | temperature | Celsius |\n| smartctl.device_power_cycles_count | power | cycles |\n| smartctl.device_read_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_write_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_verify_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_smart_attr_{attribute_name} | {attribute_name} | {attribute_unit} |\n| smartctl.device_smart_attr_{attribute_name}_normalized | {attribute_name} | value |\n\n", "integration_type": "collector", "id": "go.d.plugin-smartctl-S.M.A.R.T.", "edit_link": "https://github.com/netdata/netdata/blob/master/src/go/plugin/go.d/modules/smartctl/metadata.yaml", diff --git a/integrations/integrations.json b/integrations/integrations.json index 8708ead73fa0da..9b677b10c2a402 100644 --- a/integrations/integrations.json +++ b/integrations/integrations.json @@ -16103,7 +16103,7 @@ "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). 
You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. | 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. | 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `smartctl` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. 
If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m smartctl\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `smartctl` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep smartctl\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep smartctl /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep smartctl\n```\n\n", "alerts": "## Alerts\n\nThere are no alerts configured by default for this integration.\n", - "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per controller\n\nThese metrics refer to the Storage Device.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| device_name | Device name |\n| device_type | Device type |\n| model_name | Model name |\n| serial_number | Serial number |\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| smartctl.device_smart_status | passed, failed | status |\n| smartctl.device_ata_smart_error_log_count | error_log | logs |\n| smartctl.device_power_on_time | power_on_time | seconds |\n| smartctl.device_temperature | temperature | Celsius |\n| smartctl.device_power_cycles_count | power | cycles |\n| smartctl.device_smart_attr_{attribute_name} | {attribute_name} | {attribute_unit} |\n| smartctl.device_smart_attr_{attribute_name}_normalized | {attribute_name} | value |\n\n", + "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. 
An instance is uniquely identified by a set of labels.\n\n\n\n### Per controller\n\nThese metrics refer to the Storage Device.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| device_name | Device name |\n| device_type | Device type |\n| model_name | Model name |\n| serial_number | Serial number |\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| smartctl.device_smart_status | passed, failed | status |\n| smartctl.device_ata_smart_error_log_count | error_log | logs |\n| smartctl.device_power_on_time | power_on_time | seconds |\n| smartctl.device_temperature | temperature | Celsius |\n| smartctl.device_power_cycles_count | power | cycles |\n| smartctl.device_read_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_write_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_verify_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_smart_attr_{attribute_name} | {attribute_name} | {attribute_unit} |\n| smartctl.device_smart_attr_{attribute_name}_normalized | {attribute_name} | value |\n\n", "integration_type": "collector", "id": "go.d.plugin-smartctl-S.M.A.R.T.", "edit_link": "https://github.com/netdata/netdata/blob/master/src/go/plugin/go.d/modules/smartctl/metadata.yaml", diff --git a/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md b/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md index f645cee4f72682..1b893b488a2750 100644 --- a/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md +++ b/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md @@ -83,6 +83,9 @@ Metrics: | smartctl.device_power_on_time | power_on_time | seconds | | smartctl.device_temperature | temperature | Celsius | | smartctl.device_power_cycles_count | power | cycles | +| smartctl.device_read_errors_rate | corrected, uncorrected | errors/s | +| smartctl.device_write_errors_rate | corrected, uncorrected | errors/s | +| smartctl.device_verify_errors_rate | corrected, uncorrected | errors/s | | smartctl.device_smart_attr_{attribute_name} | {attribute_name} | {attribute_unit} | | smartctl.device_smart_attr_{attribute_name}_normalized | {attribute_name} | value | From 5f4ce0c6098690a617b47189584c3f05f02d25f2 Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Fri, 12 Jul 2024 10:04:23 +0300 Subject: [PATCH 02/18] go.d whoisquery fix "days until" in config_schema.json (#18121) --- src/go/plugin/go.d/modules/whoisquery/config_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/go/plugin/go.d/modules/whoisquery/config_schema.json b/src/go/plugin/go.d/modules/whoisquery/config_schema.json index e59fa8859848e8..fd3ef4955fef38 100644 --- a/src/go/plugin/go.d/modules/whoisquery/config_schema.json +++ b/src/go/plugin/go.d/modules/whoisquery/config_schema.json @@ -28,14 +28,14 @@ "description": "Number of days before the alarm status is set to warning.", "type": "integer", "minimum": 1, - "default": 90 + "default": 30 }, "days_until_expiration_critical": { "title": "Days until critical", "description": "Number of days before the alarm status is set to critical.", "type": "integer", "minimum": 1, - "default": 30 + "default": 15 } }, "required": [ From 1db23ce07fd4863db43a493f793bda07b05d4fbc Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Fri, 12 Jul 2024 13:05:20 +0300 Subject: [PATCH 03/18] go.d filecheck fix dir existence chart label (#18126) --- src/go/plugin/go.d/modules/filecheck/charts.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/src/go/plugin/go.d/modules/filecheck/charts.go b/src/go/plugin/go.d/modules/filecheck/charts.go index a03ebb9b7d11da..6d00463a6e3646 100644 --- a/src/go/plugin/go.d/modules/filecheck/charts.go +++ b/src/go/plugin/go.d/modules/filecheck/charts.go @@ -153,7 +153,7 @@ func (f *Filecheck) updateDirCharts(infos []*statInfo) { if !sd.hasExistenceCharts { sd.hasExistenceCharts = true - f.addFileCharts(info.path, + f.addDirCharts(info.path, dirExistenceStatusChartTmpl.Copy(), ) } From 078015b98234bfb59607144f91c86e7571a27c6e Mon Sep 17 00:00:00 2001 From: Costa Tsaousis Date: Fri, 12 Jul 2024 13:54:59 +0300 Subject: [PATCH 04/18] Spawn server fixes No 4 (#18127) add a magic number to all spawn server replies to ensure they are not corrupted --- src/libnetdata/spawn_server/spawn_server.c | 59 ++++++++++++++++++---- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/src/libnetdata/spawn_server/spawn_server.c b/src/libnetdata/spawn_server/spawn_server.c index 80fd52cf7397e2..6031e5e02918ae 100644 --- a/src/libnetdata/spawn_server/spawn_server.c +++ b/src/libnetdata/spawn_server/spawn_server.c @@ -323,13 +323,17 @@ static int connect_to_spawn_server(const char *path, bool log) { // the child created by the spawn server typedef enum __attribute__((packed)) { + STATUS_REPORT_NONE = 0, STATUS_REPORT_STARTED, STATUS_REPORT_FAILED, STATUS_REPORT_EXITED, STATUS_REPORT_PING, } STATUS_REPORT; +#define STATUS_REPORT_MAGIC 0xBADA55EE + struct status_report { + uint32_t magic; STATUS_REPORT status; union { struct { @@ -346,17 +350,20 @@ struct status_report { }; }; -static void spawn_server_send_status_ping(int fd) { +static void spawn_server_send_status_ping(int sock) { struct status_report sr = { + .magic = STATUS_REPORT_MAGIC, .status = STATUS_REPORT_PING, }; - if(write(fd, &sr, sizeof(sr)) != sizeof(sr)) - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Cannot send ping status report"); + if(write(sock, &sr, sizeof(sr)) != sizeof(sr)) + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: Cannot send ping reply."); } static void spawn_server_send_status_success(SPAWN_REQUEST *rq) { const struct status_report sr = { + .magic = STATUS_REPORT_MAGIC, .status = STATUS_REPORT_STARTED, .started = { .pid = getpid(), @@ -364,11 +371,14 @@ static void spawn_server_send_status_success(SPAWN_REQUEST *rq) { }; if(write(rq->sock, &sr, sizeof(sr)) != sizeof(sr)) - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Cannot send success status report"); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: Cannot send success status report for request %zu: %s", + rq->request_id, rq->cmdline); } static void spawn_server_send_status_failure(SPAWN_REQUEST *rq) { struct status_report sr = { + .magic = STATUS_REPORT_MAGIC, .status = STATUS_REPORT_FAILED, .failed = { .err_no = errno, @@ -376,11 +386,14 @@ static void spawn_server_send_status_failure(SPAWN_REQUEST *rq) { }; if(write(rq->sock, &sr, sizeof(sr)) != sizeof(sr)) - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Cannot send failure status report"); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: Cannot send failure status report for request %zu: %s", + rq->request_id, rq->cmdline); } static void spawn_server_send_status_exit(SPAWN_REQUEST *rq, int waitpid_status) { struct status_report sr = { + .magic = STATUS_REPORT_MAGIC, .status = STATUS_REPORT_EXITED, .exited = { .waitpid_status = waitpid_status, @@ -388,7 +401,9 @@ static void spawn_server_send_status_exit(SPAWN_REQUEST *rq, int waitpid_status) }; if(write(rq->sock, &sr, sizeof(sr)) != 
sizeof(sr)) - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Cannot send exit status report"); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: Cannot send exit status (%d) report for request %zu: %s", + waitpid_status, rq->request_id, rq->cmdline); } static void spawn_server_run_child(SPAWN_SERVER *server, SPAWN_REQUEST *rq) { @@ -1357,8 +1372,15 @@ int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE * // get the result struct status_report sr = { 0 }; if(read(instance->client_sock, &sr, sizeof(sr)) != sizeof(sr)) - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN PARENT: failed to receive final status report for child %d, request %zu", instance->child_pid, instance->request_id); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: failed to read final status report for child %d, request %zu", + instance->child_pid, instance->request_id); + else if(sr.magic != STATUS_REPORT_MAGIC) { + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: invalid final status report for child %d, request %zu (invalid magic %#x in response)", + instance->child_pid, instance->request_id, sr.magic); + } else switch(sr.status) { case STATUS_REPORT_EXITED: rc = sr.exited.waitpid_status; @@ -1368,7 +1390,9 @@ int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE * case STATUS_REPORT_FAILED: default: errno = 0; - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN PARENT: invalid status report to exec spawn request %zu for pid %d (status = %u)", instance->request_id, instance->child_pid, sr.status); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: invalid status report to exec spawn request %zu for pid %d (status = %u)", + instance->request_id, instance->child_pid, sr.status); break; } @@ -1431,7 +1455,16 @@ SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd, int custo struct status_report sr = { 0 }; if(read(instance->client_sock, &sr, sizeof(sr)) != sizeof(sr)) { - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN PARENT: Failed to exec spawn request %zu (cannot get initial status report)", request.request_id); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: Failed to exec spawn request %zu (cannot get initial status report)", + request.request_id); + goto cleanup; + } + + if(sr.magic != STATUS_REPORT_MAGIC) { + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: Failed to exec spawn request %zu (invalid magic %#x in response)", + request.request_id, sr.magic); goto cleanup; } @@ -1442,13 +1475,17 @@ SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd, int custo case STATUS_REPORT_FAILED: errno = sr.failed.err_no; - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN PARENT: Failed to exec spawn request %zu (check errno #1)", request.request_id); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: Failed to exec spawn request %zu (server reports failure, errno is updated)", + request.request_id); errno = 0; break; case STATUS_REPORT_EXITED: errno = ENOEXEC; - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN PARENT: Failed to exec spawn request %zu (check errno #2)", request.request_id); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: Failed to exec spawn request %zu (server reports exit, errno is updated)", + request.request_id); errno = 0; break; From fa8c70489f5616caa9f52cc97488d4717f7eba2e Mon Sep 17 00:00:00 2001 From: Costa Tsaousis Date: Fri, 12 Jul 2024 21:23:05 +0300 Subject: [PATCH 05/18] Spawn server improvements 5 (#18131) spawn server feedback is always sent from the spawn server itself --- src/libnetdata/os/close_range.c | 23 +- 
src/libnetdata/os/close_range.h | 2 +- src/libnetdata/spawn_server/spawn_server.c | 409 +++++++++++---------- 3 files changed, 235 insertions(+), 199 deletions(-) diff --git a/src/libnetdata/os/close_range.c b/src/libnetdata/os/close_range.c index 10869adae40248..56d5c2527ad361 100644 --- a/src/libnetdata/os/close_range.c +++ b/src/libnetdata/os/close_range.c @@ -67,20 +67,31 @@ static int compare_ints(const void *a, const void *b) { return (int_a > int_b) - (int_a < int_b); } -void os_close_all_non_std_open_fds_except(int fds[], size_t fds_num) { +void os_close_all_non_std_open_fds_except(const int fds[], size_t fds_num) { if (fds_num == 0 || fds == NULL) { os_close_range(STDERR_FILENO + 1, CLOSE_RANGE_FD_MAX); return; } - qsort(fds, fds_num, sizeof(int), compare_ints); + // copy the fds array to ensure we will not alter them + int fds_copy[fds_num]; + memcpy(fds_copy, fds, sizeof(fds_copy)); + + qsort(fds_copy, fds_num, sizeof(int), compare_ints); int start = STDERR_FILENO + 1; - for (size_t i = 0; i < fds_num; i++) { - if (fds[i] > start) - os_close_range(start, fds[i] - 1); + size_t i = 0; + + // filter out all fds with a number smaller than our start + for (; i < fds_num; i++) + if(fds_copy[i] >= start) break; + + // call os_close_range() as many times as needed + for (; i < fds_num; i++) { + if (fds_copy[i] > start) + os_close_range(start, fds_copy[i] - 1); - start = fds[i] + 1; + start = fds_copy[i] + 1; } os_close_range(start, CLOSE_RANGE_FD_MAX); diff --git a/src/libnetdata/os/close_range.h b/src/libnetdata/os/close_range.h index 239b6cd46fda62..e3cb93798ac199 100644 --- a/src/libnetdata/os/close_range.h +++ b/src/libnetdata/os/close_range.h @@ -7,6 +7,6 @@ int os_get_fd_open_max(void); void os_close_range(int first, int last); -void os_close_all_non_std_open_fds_except(int fds[], size_t fds_num); +void os_close_all_non_std_open_fds_except(const int fds[], size_t fds_num); #endif //CLOSE_RANGE_H diff --git a/src/libnetdata/spawn_server/spawn_server.c b/src/libnetdata/spawn_server/spawn_server.c index 6031e5e02918ae..8e58e9ece3bb5b 100644 --- a/src/libnetdata/spawn_server/spawn_server.c +++ b/src/libnetdata/spawn_server/spawn_server.c @@ -23,7 +23,7 @@ struct spawn_server { // it is ignored for PING requests int pipe[2]; - int server_sock; + int sock; // the listening socket of the server pid_t server_pid; char *path; spawn_request_callback_t cb; @@ -35,7 +35,7 @@ struct spawn_server { struct spawm_instance { size_t request_id; - int client_sock; + int sock; int write_fd; int read_fd; pid_t child_pid; @@ -322,95 +322,9 @@ static int connect_to_spawn_server(const char *path, bool log) { // -------------------------------------------------------------------------------------------------------------------- // the child created by the spawn server -typedef enum __attribute__((packed)) { - STATUS_REPORT_NONE = 0, - STATUS_REPORT_STARTED, - STATUS_REPORT_FAILED, - STATUS_REPORT_EXITED, - STATUS_REPORT_PING, -} STATUS_REPORT; - -#define STATUS_REPORT_MAGIC 0xBADA55EE - -struct status_report { - uint32_t magic; - STATUS_REPORT status; - union { - struct { - pid_t pid; - } started; - - struct { - int err_no; - } failed; - - struct { - int waitpid_status; - } exited; - }; -}; - -static void spawn_server_send_status_ping(int sock) { - struct status_report sr = { - .magic = STATUS_REPORT_MAGIC, - .status = STATUS_REPORT_PING, - }; - - if(write(sock, &sr, sizeof(sr)) != sizeof(sr)) - nd_log(NDLS_COLLECTORS, NDLP_ERR, - "SPAWN SERVER: Cannot send ping reply."); -} - -static void 
spawn_server_send_status_success(SPAWN_REQUEST *rq) { - const struct status_report sr = { - .magic = STATUS_REPORT_MAGIC, - .status = STATUS_REPORT_STARTED, - .started = { - .pid = getpid(), - }, - }; - - if(write(rq->sock, &sr, sizeof(sr)) != sizeof(sr)) - nd_log(NDLS_COLLECTORS, NDLP_ERR, - "SPAWN SERVER: Cannot send success status report for request %zu: %s", - rq->request_id, rq->cmdline); -} - -static void spawn_server_send_status_failure(SPAWN_REQUEST *rq) { - struct status_report sr = { - .magic = STATUS_REPORT_MAGIC, - .status = STATUS_REPORT_FAILED, - .failed = { - .err_no = errno, - }, - }; - - if(write(rq->sock, &sr, sizeof(sr)) != sizeof(sr)) - nd_log(NDLS_COLLECTORS, NDLP_ERR, - "SPAWN SERVER: Cannot send failure status report for request %zu: %s", - rq->request_id, rq->cmdline); -} - -static void spawn_server_send_status_exit(SPAWN_REQUEST *rq, int waitpid_status) { - struct status_report sr = { - .magic = STATUS_REPORT_MAGIC, - .status = STATUS_REPORT_EXITED, - .exited = { - .waitpid_status = waitpid_status, - }, - }; - - if(write(rq->sock, &sr, sizeof(sr)) != sizeof(sr)) - nd_log(NDLS_COLLECTORS, NDLP_ERR, - "SPAWN SERVER: Cannot send exit status (%d) report for request %zu: %s", - waitpid_status, rq->request_id, rq->cmdline); -} - static void spawn_server_run_child(SPAWN_SERVER *server, SPAWN_REQUEST *rq) { - // fprintf(stderr, "CHILD: running request %zu on pid %d\n", request->request_id, getpid()); - // close the server sockets; - close(server->server_sock); server->server_sock = -1; + close(server->sock); server->sock = -1; if(server->pipe[0] != -1) { close(server->pipe[0]); server->pipe[0] = -1; } if(server->pipe[1] != -1) { close(server->pipe[1]); server->pipe[1] = -1; } @@ -421,6 +335,9 @@ static void spawn_server_run_child(SPAWN_SERVER *server, SPAWN_REQUEST *rq) { os_setproctitle(buf, server->argc, server->argv); } + // just a precausion in case we have any left-over fds + os_close_all_non_std_open_fds_except(rq->fds, SPAWN_SERVER_TRANSFER_FDS); + // get the fds from the request int stdin_fd = rq->fds[0]; int stdout_fd = rq->fds[1]; @@ -429,15 +346,21 @@ static void spawn_server_run_child(SPAWN_SERVER *server, SPAWN_REQUEST *rq) { // change stdio fds to the ones in the request if (dup2(stdin_fd, STDIN_FILENO) == -1) { - spawn_server_send_status_failure(rq); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: cannot dup2(%d) stdin of request No %zu: %s", + stdin_fd, rq->request_id, rq->cmdline); exit(1); } if (dup2(stdout_fd, STDOUT_FILENO) == -1) { - spawn_server_send_status_failure(rq); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: cannot dup2(%d) stdin of request No %zu: %s", + stdout_fd, rq->request_id, rq->cmdline); exit(1); } if (dup2(stderr_fd, STDERR_FILENO) == -1) { - spawn_server_send_status_failure(rq); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: cannot dup2(%d) stderr of request No %zu: %s", + stderr_fd, rq->request_id, rq->cmdline); exit(1); } @@ -453,25 +376,15 @@ static void spawn_server_run_child(SPAWN_SERVER *server, SPAWN_REQUEST *rq) { switch (rq->type) { case SPAWN_INSTANCE_TYPE_EXEC: - spawn_server_send_status_success(rq); - close(rq->sock); rq->sock = -1; - close(custom_fd); custom_fd = -1; + if(custom_fd != -1) { close(custom_fd); custom_fd = -1; } execvp(rq->argv[0], (char **)rq->argv); nd_log(NDLS_COLLECTORS, NDLP_ERR, - "SPAWN SERVER: Failed to execute command of request No %zu (argv[0] = '%s')", - rq->request_id, rq->argv[0]); + "SPAWN SERVER: Failed to execute command of request No %zu: %s", + rq->request_id, 
rq->cmdline); exit(1); break; case SPAWN_INSTANCE_TYPE_CALLBACK: - if(server->cb == NULL) { - errno = ENOENT; - spawn_server_send_status_failure(rq); - close(rq->sock); rq->sock = -1; - exit(1); - } - spawn_server_send_status_success(rq); - close(rq->sock); rq->sock = -1; server->cb(rq); exit(0); break; @@ -592,6 +505,177 @@ static BUFFER *argv_to_cmdline_buffer(const char **argv) { return wb; } +// -------------------------------------------------------------------------------------------------------------------- +// status reports + +typedef enum __attribute__((packed)) { + STATUS_REPORT_NONE = 0, + STATUS_REPORT_STARTED, + STATUS_REPORT_FAILED, + STATUS_REPORT_EXITED, + STATUS_REPORT_PING, +} STATUS_REPORT; + +#define STATUS_REPORT_MAGIC 0xBADA55EE + +struct status_report { + uint32_t magic; + STATUS_REPORT status; + union { + struct { + pid_t pid; + } started; + + struct { + int err_no; + } failed; + + struct { + int waitpid_status; + } exited; + }; +}; + +static void spawn_server_send_status_ping(int sock) { + struct status_report sr = { + .magic = STATUS_REPORT_MAGIC, + .status = STATUS_REPORT_PING, + }; + + if(write(sock, &sr, sizeof(sr)) != sizeof(sr)) + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: Cannot send ping reply."); +} + +static void spawn_server_send_status_success(SPAWN_REQUEST *rq) { + const struct status_report sr = { + .magic = STATUS_REPORT_MAGIC, + .status = STATUS_REPORT_STARTED, + .started = { + .pid = rq->pid, + }, + }; + + if(write(rq->sock, &sr, sizeof(sr)) != sizeof(sr)) + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: Cannot send success status report for pid %d, request %zu: %s", + rq->pid, rq->request_id, rq->cmdline); +} + +static void spawn_server_send_status_failure(SPAWN_REQUEST *rq) { + struct status_report sr = { + .magic = STATUS_REPORT_MAGIC, + .status = STATUS_REPORT_FAILED, + .failed = { + .err_no = errno, + }, + }; + + if(write(rq->sock, &sr, sizeof(sr)) != sizeof(sr)) + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: Cannot send failure status report for request %zu: %s", + rq->request_id, rq->cmdline); +} + +static void spawn_server_send_status_exit(SPAWN_REQUEST *rq, int waitpid_status) { + struct status_report sr = { + .magic = STATUS_REPORT_MAGIC, + .status = STATUS_REPORT_EXITED, + .exited = { + .waitpid_status = waitpid_status, + }, + }; + + if(write(rq->sock, &sr, sizeof(sr)) != sizeof(sr)) + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN SERVER: Cannot send exit status (%d) report for pid %d, request %zu: %s", + waitpid_status, rq->pid, rq->request_id, rq->cmdline); +} + +// -------------------------------------------------------------------------------------------------------------------- +// execute a received request + +static void request_free(SPAWN_REQUEST *rq) { + if(rq->fds[0] != -1) close(rq->fds[0]); + if(rq->fds[1] != -1) close(rq->fds[1]); + if(rq->fds[2] != -1) close(rq->fds[2]); + if(rq->fds[3] != -1) close(rq->fds[3]); + if(rq->sock != -1) close(rq->sock); + freez((void *)rq->argv); + freez((void *)rq->environment); + freez((void *)rq->data); + freez((void *)rq->cmdline); + freez((void *)rq); +} + +static void spawn_server_execute_request(SPAWN_SERVER *server, SPAWN_REQUEST *rq) { + switch(rq->type) { + case SPAWN_INSTANCE_TYPE_EXEC: + // close custom_fd - it is not needed for exec mode + if(rq->fds[3] != -1) { close(rq->fds[3]); rq->fds[3] = -1; } + + // create the cmdline for logs + if(rq->argv) { + CLEAN_BUFFER *wb = argv_to_cmdline_buffer(rq->argv); + rq->cmdline = strdupz(buffer_tostring(wb)); + } + break; + + 
case SPAWN_INSTANCE_TYPE_CALLBACK: + if(server->cb == NULL) { + errno = ENOSYS; + spawn_server_send_status_failure(rq); + request_free(rq); + return; + } + rq->cmdline = strdupz("callback() function"); + break; + + default: + errno = EINVAL; + spawn_server_send_status_failure(rq); + request_free(rq); + return; + } + + pid_t pid = fork(); + if (pid < 0) { + // fork failed + + nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Failed to fork() child."); + spawn_server_send_status_failure(rq); + request_free(rq); + return; + } + else if (pid == 0) { + // the child + + spawn_server_run_child(server, rq); + exit(63); + } + + // the parent + rq->pid = pid; + + // let the parent know + spawn_server_send_status_success(rq); + + // do not keep data we don't need at the parent + freez((void *)rq->environment); rq->environment = NULL; + freez((void *)rq->argv); rq->argv = NULL; + freez((void *)rq->data); rq->data = NULL; + rq->data_size = 0; + + // do not keep fds we don't need at the parent + if(rq->fds[0] != -1) { close(rq->fds[0]); rq->fds[0] = -1; } + if(rq->fds[1] != -1) { close(rq->fds[1]); rq->fds[1] = -1; } + if(rq->fds[2] != -1) { close(rq->fds[2]); rq->fds[2] = -1; } + if(rq->fds[3] != -1) { close(rq->fds[3]); rq->fds[3] = -1; } + + // keep it in the list + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(spawn_server_requests, rq, prev, next); +} + // -------------------------------------------------------------------------------------------------------------------- // Sending and receiving requests @@ -747,75 +831,6 @@ static bool spawn_server_send_request(ND_UUID *magic, SPAWN_REQUEST *request) { return ret; } -static void request_free(SPAWN_REQUEST *rq) { - if(rq->fds[0] != -1) close(rq->fds[0]); - if(rq->fds[1] != -1) close(rq->fds[1]); - if(rq->fds[2] != -1) close(rq->fds[2]); - if(rq->fds[3] != -1) close(rq->fds[3]); - if(rq->sock != -1) close(rq->sock); - freez((void *)rq->argv); - freez((void *)rq->environment); - freez((void *)rq->data); - freez((void *)rq->cmdline); - freez((void *)rq); -} - -static void spawn_server_execute_request(SPAWN_SERVER *server, SPAWN_REQUEST *rq) { - switch(rq->type) { - case SPAWN_INSTANCE_TYPE_EXEC: - if(rq->argv) { - CLEAN_BUFFER *wb = argv_to_cmdline_buffer(rq->argv); - rq->cmdline = strdupz(buffer_tostring(wb)); - } - break; - - case SPAWN_INSTANCE_TYPE_CALLBACK: - rq->cmdline = strdupz("callback() function"); - break; - - default: - rq->cmdline = strdupz("[unknown request type]"); - break; - } - - pid_t pid = fork(); - if (pid < 0) { - // fork failed - - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Failed to fork() child."); - spawn_server_send_status_failure(rq); - request_free(rq); - return; - } - else if (pid == 0) { - // the child - - spawn_server_run_child(server, rq); - exit(63); - } - - // the parent - rq->pid = pid; - - // do not keep data we don't need at the parent - freez((void *)rq->environment); rq->environment = NULL; - freez((void *)rq->argv); rq->argv = NULL; - freez((void *)rq->data); rq->data = NULL; - rq->data_size = 0; - - // do not keep fds we don't need at the parent - if(rq->fds[0] != -1) { close(rq->fds[0]); rq->fds[0] = -1; } - if(rq->fds[1] != -1) { close(rq->fds[1]); rq->fds[1] = -1; } - if(rq->fds[2] != -1) { close(rq->fds[2]); rq->fds[2] = -1; } - if(rq->fds[3] != -1) { close(rq->fds[3]); rq->fds[3] = -1; } - - // keep it in the list - DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(spawn_server_requests, rq, prev, next); - - // do not fork this socket on other children - sock_setcloexec(rq->sock); -} - static void 
spawn_server_receive_request(int sock, SPAWN_SERVER *server) { struct msghdr msg = {0}; struct iovec iov[7]; @@ -1110,7 +1125,7 @@ static void spawn_server_event_loop(SPAWN_SERVER *server) { } struct pollfd fds[2]; - fds[0].fd = server->server_sock; + fds[0].fd = server->sock; fds[0].events = POLLIN; fds[1].fd = pipe_fd; fds[1].events = POLLHUP | POLLERR; @@ -1137,13 +1152,17 @@ static void spawn_server_event_loop(SPAWN_SERVER *server) { } if (fds[0].revents & POLLIN) { - int client_sock = accept(server->server_sock, NULL, NULL); - if (client_sock == -1) { + int sock = accept(server->sock, NULL, NULL); + if (sock == -1) { nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: accept() failed"); continue; } - spawn_server_receive_request(client_sock, server); + // do not fork this socket + sock_setcloexec(sock); + + // receive the request and process it + spawn_server_receive_request(sock, server); } } @@ -1174,7 +1193,7 @@ static void spawn_server_event_loop(SPAWN_SERVER *server) { void spawn_server_destroy(SPAWN_SERVER *server) { if(server->pipe[0] != -1) close(server->pipe[0]); if(server->pipe[1] != -1) close(server->pipe[1]); - if(server->server_sock != -1) close(server->server_sock); + if(server->sock != -1) close(server->sock); if(server->server_pid) { kill(server->server_pid, SIGTERM); @@ -1196,7 +1215,7 @@ static bool spawn_server_create_listening_socket(SPAWN_SERVER *server) { return false; } - if ((server->server_sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + if ((server->sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Failed to create socket()"); return false; } @@ -1208,12 +1227,12 @@ static bool spawn_server_create_listening_socket(SPAWN_SERVER *server) { unlink(server->path); errno = 0; - if (bind(server->server_sock, (struct sockaddr *)&server_addr, sizeof(server_addr)) == -1) { + if (bind(server->sock, (struct sockaddr *)&server_addr, sizeof(server_addr)) == -1) { nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Failed to bind()"); return false; } - if (listen(server->server_sock, 5) == -1) { + if (listen(server->sock, 5) == -1) { nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Failed to listen()"); return false; } @@ -1250,7 +1269,7 @@ SPAWN_SERVER* spawn_server_create(SPAWN_SERVER_OPTIONS options, const char *name SPAWN_SERVER *server = callocz(1, sizeof(SPAWN_SERVER)); server->pipe[0] = -1; server->pipe[1] = -1; - server->server_sock = -1; + server->sock = -1; server->cb = child_callback; server->argc = argc; server->argv = argv; @@ -1309,6 +1328,7 @@ SPAWN_SERVER* spawn_server_create(SPAWN_SERVER_OPTIONS options, const char *name pid_t pid = fork(); if (pid == 0) { // the child - the spawn server + { char buf[15]; snprintfz(buf, sizeof(buf), "spawn-%s", server->name); @@ -1316,13 +1336,13 @@ SPAWN_SERVER* spawn_server_create(SPAWN_SERVER_OPTIONS options, const char *name } replace_stdio_with_dev_null(); - os_close_all_non_std_open_fds_except((int[]){ server->server_sock, server->pipe[1] }, 2); + os_close_all_non_std_open_fds_except((int[]){ server->sock, server->pipe[1] }, 2); spawn_server_event_loop(server); } else if (pid > 0) { // the parent server->server_pid = pid; - close(server->server_sock); server->server_sock = -1; + close(server->sock); server->sock = -1; close(server->pipe[1]); server->pipe[1] = -1; struct status_report sr = { 0 }; @@ -1358,7 +1378,7 @@ void spawn_server_exec_destroy(SPAWN_INSTANCE *instance) { if(instance->child_pid) kill(instance->child_pid, SIGTERM); if(instance->write_fd != -1) 
close(instance->write_fd); if(instance->read_fd != -1) close(instance->read_fd); - if(instance->client_sock != -1) close(instance->client_sock); + if(instance->sock != -1) close(instance->sock); freez(instance); } @@ -1371,7 +1391,7 @@ int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE * // get the result struct status_report sr = { 0 }; - if(read(instance->client_sock, &sr, sizeof(sr)) != sizeof(sr)) + if(read(instance->sock, &sr, sizeof(sr)) != sizeof(sr)) nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN PARENT: failed to read final status report for child %d, request %zu", instance->child_pid, instance->request_id); @@ -1414,8 +1434,8 @@ SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd, int custo instance->read_fd = -1; instance->write_fd = -1; - instance->client_sock = connect_to_spawn_server(server->path, true); - if(instance->client_sock == -1) + instance->sock = connect_to_spawn_server(server->path, true); + if(instance->sock == -1) goto cleanup; if (pipe(pipe_stdin) == -1) { @@ -1430,7 +1450,7 @@ SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd, int custo SPAWN_REQUEST request = { .request_id = __atomic_add_fetch(&server->request_id, 1, __ATOMIC_RELAXED), - .sock = instance->client_sock, + .sock = instance->sock, .fds = { [0] = pipe_stdin[0], [1] = pipe_stdout[1], @@ -1453,8 +1473,11 @@ SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd, int custo close(pipe_stdout[1]); pipe_stdout[1] = -1; instance->read_fd = pipe_stdout[0]; pipe_stdout[0] = -1; + // copy the request id to the instance + instance->request_id = request.request_id; + struct status_report sr = { 0 }; - if(read(instance->client_sock, &sr, sizeof(sr)) != sizeof(sr)) { + if(read(instance->sock, &sr, sizeof(sr)) != sizeof(sr)) { nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN PARENT: Failed to exec spawn request %zu (cannot get initial status report)", request.request_id); @@ -1491,7 +1514,9 @@ SPAWN_INSTANCE* spawn_server_exec(SPAWN_SERVER *server, int stderr_fd, int custo default: errno = 0; - nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN PARENT: Invalid status report to exec spawn request %zu (received invalid data)", request.request_id); + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: Invalid status report to exec spawn request %zu (received invalid data)", + request.request_id); break; } From 28de8a8c71925b63782ad41ab1f3bda176cb89fd Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Fri, 12 Jul 2024 22:44:06 +0300 Subject: [PATCH 06/18] go.d fix url path overwrite (#18132) --- src/go/plugin/go.d/modules/consul/collect.go | 3 +- .../plugin/go.d/modules/couchbase/collect.go | 8 ++-- src/go/plugin/go.d/modules/couchdb/collect.go | 20 ++++----- src/go/plugin/go.d/modules/dnsdist/collect.go | 6 ++- .../go.d/modules/elasticsearch/collect.go | 17 ++++--- src/go/plugin/go.d/modules/ipfs/collect.go | 16 ++----- .../plugin/go.d/modules/logstash/collect.go | 6 ++- .../modules/nginxplus/nginx_http_api_query.go | 45 +++++++------------ src/go/plugin/go.d/modules/pihole/collect.go | 12 ++--- .../plugin/go.d/modules/powerdns/collect.go | 3 +- .../go.d/modules/powerdns_recursor/collect.go | 3 +- src/go/plugin/go.d/modules/puppet/collect.go | 3 +- .../plugin/go.d/modules/rabbitmq/collect.go | 4 +- src/go/plugin/go.d/modules/rspamd/collect.go | 4 +- src/go/plugin/go.d/pkg/web/request.go | 13 ++++++ src/go/plugin/go.d/pkg/web/request_test.go | 28 ++++++++++++ 16 files changed, 99 insertions(+), 92 deletions(-) diff --git 
a/src/go/plugin/go.d/modules/consul/collect.go b/src/go/plugin/go.d/modules/consul/collect.go index 565ad649c4d44c..3033e046e3bebb 100644 --- a/src/go/plugin/go.d/modules/consul/collect.go +++ b/src/go/plugin/go.d/modules/consul/collect.go @@ -69,12 +69,11 @@ func (c *Consul) isServer() bool { } func (c *Consul) doOKDecode(urlPath string, in interface{}, statusCodes ...int) error { - req, err := web.NewHTTPRequest(c.Request.Copy()) + req, err := web.NewHTTPRequestWithPath(c.Request, urlPath) if err != nil { return fmt.Errorf("error on creating request: %v", err) } - req.URL.Path = urlPath if c.ACLToken != "" { req.Header.Set("X-Consul-Token", c.ACLToken) } diff --git a/src/go/plugin/go.d/modules/couchbase/collect.go b/src/go/plugin/go.d/modules/couchbase/collect.go index d14ab722868789..6027ac918a47c1 100644 --- a/src/go/plugin/go.d/modules/couchbase/collect.go +++ b/src/go/plugin/go.d/modules/couchbase/collect.go @@ -112,11 +112,13 @@ func (cb *Couchbase) addDimToChart(chartID string, dim *module.Dim) { } func (cb *Couchbase) scrapeCouchbase() (*cbMetrics, error) { - ms := &cbMetrics{} - req, _ := web.NewHTTPRequest(cb.Request) - req.URL.Path = urlPathBucketsStats + req, err := web.NewHTTPRequestWithPath(cb.Request, urlPathBucketsStats) + if err != nil { + return nil, err + } req.URL.RawQuery = url.Values{"skipMap": []string{"true"}}.Encode() + ms := &cbMetrics{} if err := cb.doOKDecode(req, &ms.BucketsBasicStats); err != nil { return nil, err } diff --git a/src/go/plugin/go.d/modules/couchdb/collect.go b/src/go/plugin/go.d/modules/couchdb/collect.go index 1ccebfaeac3bc1..21b38fb3a60df8 100644 --- a/src/go/plugin/go.d/modules/couchdb/collect.go +++ b/src/go/plugin/go.d/modules/couchdb/collect.go @@ -120,8 +120,7 @@ func (cdb *CouchDB) scrapeCouchDB() *cdbMetrics { } func (cdb *CouchDB) scrapeNodeStats(ms *cdbMetrics) { - req, _ := web.NewHTTPRequest(cdb.Request) - req.URL.Path = fmt.Sprintf(urlPathOverviewStats, cdb.Config.Node) + req, _ := web.NewHTTPRequestWithPath(cdb.Request, fmt.Sprintf(urlPathOverviewStats, cdb.Config.Node)) var stats cdbNodeStats if err := cdb.doOKDecode(req, &stats); err != nil { @@ -132,8 +131,7 @@ func (cdb *CouchDB) scrapeNodeStats(ms *cdbMetrics) { } func (cdb *CouchDB) scrapeSystemStats(ms *cdbMetrics) { - req, _ := web.NewHTTPRequest(cdb.Request) - req.URL.Path = fmt.Sprintf(urlPathSystemStats, cdb.Config.Node) + req, _ := web.NewHTTPRequestWithPath(cdb.Request, fmt.Sprintf(urlPathSystemStats, cdb.Config.Node)) var stats cdbNodeSystem if err := cdb.doOKDecode(req, &stats); err != nil { @@ -144,8 +142,7 @@ func (cdb *CouchDB) scrapeSystemStats(ms *cdbMetrics) { } func (cdb *CouchDB) scrapeActiveTasks(ms *cdbMetrics) { - req, _ := web.NewHTTPRequest(cdb.Request) - req.URL.Path = urlPathActiveTasks + req, _ := web.NewHTTPRequestWithPath(cdb.Request, urlPathActiveTasks) var stats []cdbActiveTask if err := cdb.doOKDecode(req, &stats); err != nil { @@ -156,8 +153,7 @@ func (cdb *CouchDB) scrapeActiveTasks(ms *cdbMetrics) { } func (cdb *CouchDB) scrapeDBStats(ms *cdbMetrics) { - req, _ := web.NewHTTPRequest(cdb.Request) - req.URL.Path = urlPathDatabases + req, _ := web.NewHTTPRequestWithPath(cdb.Request, urlPathDatabases) req.Method = http.MethodPost req.Header.Add("Accept", "application/json") req.Header.Add("Content-Type", "application/json") @@ -182,18 +178,18 @@ func (cdb *CouchDB) scrapeDBStats(ms *cdbMetrics) { } func findMaxMQSize(MessageQueues map[string]interface{}) int64 { - var max float64 + var maxSize float64 for _, mq := range MessageQueues { switch 
mqSize := mq.(type) { case float64: - max = math.Max(max, mqSize) + maxSize = math.Max(maxSize, mqSize) case map[string]interface{}: if v, ok := mqSize["count"].(float64); ok { - max = math.Max(max, v) + maxSize = math.Max(maxSize, v) } } } - return int64(max) + return int64(maxSize) } func (cdb *CouchDB) pingCouchDB() error { diff --git a/src/go/plugin/go.d/modules/dnsdist/collect.go b/src/go/plugin/go.d/modules/dnsdist/collect.go index 719defa369acf1..9b860abf4e5989 100644 --- a/src/go/plugin/go.d/modules/dnsdist/collect.go +++ b/src/go/plugin/go.d/modules/dnsdist/collect.go @@ -36,8 +36,10 @@ func (d *DNSdist) collectStatistic(collected map[string]int64, statistics *stati } func (d *DNSdist) scrapeStatistics() (*statisticMetrics, error) { - req, _ := web.NewHTTPRequest(d.Request) - req.URL.Path = urlPathJSONStat + req, err := web.NewHTTPRequestWithPath(d.Request, urlPathJSONStat) + if err != nil { + return nil, err + } req.URL.RawQuery = url.Values{"command": []string{"stats"}}.Encode() var statistics statisticMetrics diff --git a/src/go/plugin/go.d/modules/elasticsearch/collect.go b/src/go/plugin/go.d/modules/elasticsearch/collect.go index 5e3ab8c217bd88..4f46f1088fa509 100644 --- a/src/go/plugin/go.d/modules/elasticsearch/collect.go +++ b/src/go/plugin/go.d/modules/elasticsearch/collect.go @@ -158,13 +158,15 @@ func (es *Elasticsearch) scrapeElasticsearch() *esMetrics { } func (es *Elasticsearch) scrapeNodesStats(ms *esMetrics) { - req, _ := web.NewHTTPRequest(es.Request) + var p string if es.ClusterMode { - req.URL.Path = urlPathNodesStats + p = urlPathNodesStats } else { - req.URL.Path = urlPathLocalNodeStats + p = urlPathLocalNodeStats } + req, _ := web.NewHTTPRequestWithPath(es.Request, p) + var stats esNodesStats if err := es.doOKDecode(req, &stats); err != nil { es.Warning(err) @@ -175,8 +177,7 @@ func (es *Elasticsearch) scrapeNodesStats(ms *esMetrics) { } func (es *Elasticsearch) scrapeClusterHealth(ms *esMetrics) { - req, _ := web.NewHTTPRequest(es.Request) - req.URL.Path = urlPathClusterHealth + req, _ := web.NewHTTPRequestWithPath(es.Request, urlPathClusterHealth) var health esClusterHealth if err := es.doOKDecode(req, &health); err != nil { @@ -188,8 +189,7 @@ func (es *Elasticsearch) scrapeClusterHealth(ms *esMetrics) { } func (es *Elasticsearch) scrapeClusterStats(ms *esMetrics) { - req, _ := web.NewHTTPRequest(es.Request) - req.URL.Path = urlPathClusterStats + req, _ := web.NewHTTPRequestWithPath(es.Request, urlPathClusterStats) var stats esClusterStats if err := es.doOKDecode(req, &stats); err != nil { @@ -201,8 +201,7 @@ func (es *Elasticsearch) scrapeClusterStats(ms *esMetrics) { } func (es *Elasticsearch) scrapeLocalIndicesStats(ms *esMetrics) { - req, _ := web.NewHTTPRequest(es.Request) - req.URL.Path = urlPathIndicesStats + req, _ := web.NewHTTPRequestWithPath(es.Request, urlPathIndicesStats) req.URL.RawQuery = "local=true&format=json" var stats []esIndexStats diff --git a/src/go/plugin/go.d/modules/ipfs/collect.go b/src/go/plugin/go.d/modules/ipfs/collect.go index 930a5a1d8152c1..6bd0b128a3bcb4 100644 --- a/src/go/plugin/go.d/modules/ipfs/collect.go +++ b/src/go/plugin/go.d/modules/ipfs/collect.go @@ -125,13 +125,11 @@ func (ip *IPFS) collectPinLs(mx map[string]int64) error { } func (ip *IPFS) queryStatsBandwidth() (*ipfsStatsBw, error) { - req, err := web.NewHTTPRequest(ip.Request) + req, err := web.NewHTTPRequestWithPath(ip.Request, urlPathStatsBandwidth) if err != nil { return nil, err } - req.URL.Path = urlPathStatsBandwidth - var stats ipfsStatsBw if err := 
ip.doOKDecode(req, &stats); err != nil { return nil, err @@ -145,13 +143,11 @@ func (ip *IPFS) queryStatsBandwidth() (*ipfsStatsBw, error) { } func (ip *IPFS) querySwarmPeers() (*ipfsSwarmPeers, error) { - req, err := web.NewHTTPRequest(ip.Request) + req, err := web.NewHTTPRequestWithPath(ip.Request, urlPathSwarmPeers) if err != nil { return nil, err } - req.URL.Path = urlPathSwarmPeers - var stats ipfsSwarmPeers if err := ip.doOKDecode(req, &stats); err != nil { return nil, err @@ -161,13 +157,11 @@ func (ip *IPFS) querySwarmPeers() (*ipfsSwarmPeers, error) { } func (ip *IPFS) queryStatsRepo() (*ipfsStatsRepo, error) { - req, err := web.NewHTTPRequest(ip.Request) + req, err := web.NewHTTPRequestWithPath(ip.Request, urlPathStatsRepo) if err != nil { return nil, err } - req.URL.Path = urlPathStatsRepo - var stats ipfsStatsRepo if err := ip.doOKDecode(req, &stats); err != nil { return nil, err @@ -177,13 +171,11 @@ func (ip *IPFS) queryStatsRepo() (*ipfsStatsRepo, error) { } func (ip *IPFS) queryPinLs() (*ipfsPinsLs, error) { - req, err := web.NewHTTPRequest(ip.Request) + req, err := web.NewHTTPRequestWithPath(ip.Request, urlPathPinLs) if err != nil { return nil, err } - req.URL.Path = urlPathPinLs - var stats ipfsPinsLs if err := ip.doOKDecode(req, &stats); err != nil { return nil, err diff --git a/src/go/plugin/go.d/modules/logstash/collect.go b/src/go/plugin/go.d/modules/logstash/collect.go index b2f4275ce49585..ff506d64086145 100644 --- a/src/go/plugin/go.d/modules/logstash/collect.go +++ b/src/go/plugin/go.d/modules/logstash/collect.go @@ -45,8 +45,10 @@ func (l *Logstash) updateCharts(pipelines map[string]pipelineStats) { } func (l *Logstash) queryNodeStats() (*nodeStats, error) { - req, _ := web.NewHTTPRequest(l.Request.Copy()) - req.URL.Path = urlPathNodeStatsAPI + req, err := web.NewHTTPRequestWithPath(l.Request, urlPathNodeStatsAPI) + if err != nil { + return nil, err + } var stats nodeStats diff --git a/src/go/plugin/go.d/modules/nginxplus/nginx_http_api_query.go b/src/go/plugin/go.d/modules/nginxplus/nginx_http_api_query.go index 9ee4d62eb0206b..b54cd142a6a025 100644 --- a/src/go/plugin/go.d/modules/nginxplus/nginx_http_api_query.go +++ b/src/go/plugin/go.d/modules/nginxplus/nginx_http_api_query.go @@ -46,8 +46,7 @@ type nginxMetrics struct { } func (n *NginxPlus) queryAPIVersion() (int64, error) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = urlPathAPIVersions + req, _ := web.NewHTTPRequestWithPath(n.Request, urlPathAPIVersions) var versions nginxAPIVersions if err := n.doWithDecode(&versions, req); err != nil { @@ -62,8 +61,7 @@ func (n *NginxPlus) queryAPIVersion() (int64, error) { } func (n *NginxPlus) queryAvailableEndpoints() error { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIEndpointsRoot, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIEndpointsRoot, n.apiVersion)) var endpoints []string if err := n.doWithDecode(&endpoints, req); err != nil { @@ -91,8 +89,7 @@ func (n *NginxPlus) queryAvailableEndpoints() error { if hasHTTP { endpoints = endpoints[:0] - req, _ = web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIEndpointsHTTP, n.apiVersion) + req, _ = web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIEndpointsHTTP, n.apiVersion)) if err := n.doWithDecode(&endpoints, req); err != nil { return err @@ -117,8 +114,7 @@ func (n *NginxPlus) queryAvailableEndpoints() error { if hasStream { endpoints = endpoints[:0] - req, _ = 
web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIEndpointsStream, n.apiVersion) + req, _ = web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIEndpointsStream, n.apiVersion)) if err := n.doWithDecode(&endpoints, req); err != nil { return err @@ -171,8 +167,7 @@ func (n *NginxPlus) queryMetrics() *nginxMetrics { } func (n *NginxPlus) queryNginxInfo(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPINginx, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPINginx, n.apiVersion)) var v nginxInfo @@ -186,8 +181,7 @@ func (n *NginxPlus) queryNginxInfo(ms *nginxMetrics) { } func (n *NginxPlus) queryConnections(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIConnections, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIConnections, n.apiVersion)) var v nginxConnections @@ -201,8 +195,7 @@ func (n *NginxPlus) queryConnections(ms *nginxMetrics) { } func (n *NginxPlus) querySSL(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPISSL, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPISSL, n.apiVersion)) var v nginxSSL @@ -216,8 +209,7 @@ func (n *NginxPlus) querySSL(ms *nginxMetrics) { } func (n *NginxPlus) queryHTTPRequests(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIHTTPRequests, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIHTTPRequests, n.apiVersion)) var v nginxHTTPRequests @@ -231,8 +223,7 @@ func (n *NginxPlus) queryHTTPRequests(ms *nginxMetrics) { } func (n *NginxPlus) queryHTTPServerZones(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIHTTPServerZones, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIHTTPServerZones, n.apiVersion)) var v nginxHTTPServerZones @@ -246,8 +237,7 @@ func (n *NginxPlus) queryHTTPServerZones(ms *nginxMetrics) { } func (n *NginxPlus) queryHTTPLocationZones(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIHTTPLocationZones, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIHTTPLocationZones, n.apiVersion)) var v nginxHTTPLocationZones @@ -261,8 +251,7 @@ func (n *NginxPlus) queryHTTPLocationZones(ms *nginxMetrics) { } func (n *NginxPlus) queryHTTPUpstreams(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIHTTPUpstreams, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIHTTPUpstreams, n.apiVersion)) var v nginxHTTPUpstreams @@ -276,8 +265,7 @@ func (n *NginxPlus) queryHTTPUpstreams(ms *nginxMetrics) { } func (n *NginxPlus) queryHTTPCaches(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIHTTPCaches, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIHTTPCaches, n.apiVersion)) var v nginxHTTPCaches @@ -291,8 +279,7 @@ func (n *NginxPlus) queryHTTPCaches(ms *nginxMetrics) { } func (n *NginxPlus) queryStreamServerZones(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIStreamServerZones, n.apiVersion) + req, _ := 
web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIStreamServerZones, n.apiVersion)) var v nginxStreamServerZones @@ -306,8 +293,7 @@ func (n *NginxPlus) queryStreamServerZones(ms *nginxMetrics) { } func (n *NginxPlus) queryStreamUpstreams(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIStreamUpstreams, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIStreamUpstreams, n.apiVersion)) var v nginxStreamUpstreams @@ -321,8 +307,7 @@ func (n *NginxPlus) queryStreamUpstreams(ms *nginxMetrics) { } func (n *NginxPlus) queryResolvers(ms *nginxMetrics) { - req, _ := web.NewHTTPRequest(n.Request.Copy()) - req.URL.Path = fmt.Sprintf(urlPathAPIResolvers, n.apiVersion) + req, _ := web.NewHTTPRequestWithPath(n.Request, fmt.Sprintf(urlPathAPIResolvers, n.apiVersion)) var v nginxResolvers diff --git a/src/go/plugin/go.d/modules/pihole/collect.go b/src/go/plugin/go.d/modules/pihole/collect.go index b38cc9ef71cebb..c9e6d8451e4482 100644 --- a/src/go/plugin/go.d/modules/pihole/collect.go +++ b/src/go/plugin/go.d/modules/pihole/collect.go @@ -131,13 +131,12 @@ func (p *Pihole) queryMetrics(pmx *piholeMetrics, doConcurrently bool) { } func (p *Pihole) querySummary(pmx *piholeMetrics) { - req, err := web.NewHTTPRequest(p.Request) + req, err := web.NewHTTPRequestWithPath(p.Request, urlPathAPI) if err != nil { p.Error(err) return } - req.URL.Path = urlPathAPI req.URL.RawQuery = url.Values{ urlQueryKeyAuth: []string{p.Password}, urlQueryKeySummaryRaw: []string{"true"}, @@ -153,13 +152,12 @@ func (p *Pihole) querySummary(pmx *piholeMetrics) { } func (p *Pihole) queryQueryTypes(pmx *piholeMetrics) { - req, err := web.NewHTTPRequest(p.Request) + req, err := web.NewHTTPRequestWithPath(p.Request, urlPathAPI) if err != nil { p.Error(err) return } - req.URL.Path = urlPathAPI req.URL.RawQuery = url.Values{ urlQueryKeyAuth: []string{p.Password}, urlQueryKeyGetQueryTypes: []string{"true"}, @@ -176,13 +174,12 @@ func (p *Pihole) queryQueryTypes(pmx *piholeMetrics) { } func (p *Pihole) queryForwardedDestinations(pmx *piholeMetrics) { - req, err := web.NewHTTPRequest(p.Request) + req, err := web.NewHTTPRequestWithPath(p.Request, urlPathAPI) if err != nil { p.Error(err) return } - req.URL.Path = urlPathAPI req.URL.RawQuery = url.Values{ urlQueryKeyAuth: []string{p.Password}, urlQueryKeyGetForwardDestinations: []string{"true"}, @@ -199,12 +196,11 @@ func (p *Pihole) queryForwardedDestinations(pmx *piholeMetrics) { } func (p *Pihole) queryAPIVersion() (int, error) { - req, err := web.NewHTTPRequest(p.Request) + req, err := web.NewHTTPRequestWithPath(p.Request, urlPathAPI) if err != nil { return 0, err } - req.URL.Path = urlPathAPI req.URL.RawQuery = url.Values{ urlQueryKeyAuth: []string{p.Password}, urlQueryKeyAPIVersion: []string{"true"}, diff --git a/src/go/plugin/go.d/modules/powerdns/collect.go b/src/go/plugin/go.d/modules/powerdns/collect.go index a1114ea708ed7b..c2831e0f2102b4 100644 --- a/src/go/plugin/go.d/modules/powerdns/collect.go +++ b/src/go/plugin/go.d/modules/powerdns/collect.go @@ -65,8 +65,7 @@ func (ns *AuthoritativeNS) collectStatistics(collected map[string]int64, statist } func (ns *AuthoritativeNS) scrapeStatistics() ([]statisticMetric, error) { - req, _ := web.NewHTTPRequest(ns.Request) - req.URL.Path = urlPathLocalStatistics + req, _ := web.NewHTTPRequestWithPath(ns.Request, urlPathLocalStatistics) var statistics statisticMetrics if err := ns.doOKDecode(req, &statistics); err != nil { diff --git 
a/src/go/plugin/go.d/modules/powerdns_recursor/collect.go b/src/go/plugin/go.d/modules/powerdns_recursor/collect.go index 0343d3d903d737..784093ccf95c72 100644 --- a/src/go/plugin/go.d/modules/powerdns_recursor/collect.go +++ b/src/go/plugin/go.d/modules/powerdns_recursor/collect.go @@ -65,8 +65,7 @@ func (r *Recursor) collectStatistics(collected map[string]int64, statistics stat } func (r *Recursor) scrapeStatistics() ([]statisticMetric, error) { - req, _ := web.NewHTTPRequest(r.Request) - req.URL.Path = urlPathLocalStatistics + req, _ := web.NewHTTPRequestWithPath(r.Request, urlPathLocalStatistics) var statistics statisticMetrics if err := r.doOKDecode(req, &statistics); err != nil { diff --git a/src/go/plugin/go.d/modules/puppet/collect.go b/src/go/plugin/go.d/modules/puppet/collect.go index 8e4e1fdb556f38..a1b95e09c79821 100644 --- a/src/go/plugin/go.d/modules/puppet/collect.go +++ b/src/go/plugin/go.d/modules/puppet/collect.go @@ -31,12 +31,11 @@ func (p *Puppet) collect() (map[string]int64, error) { } func (p *Puppet) queryStatsService() (*statusServiceResponse, error) { - req, err := web.NewHTTPRequest(p.Request) + req, err := web.NewHTTPRequestWithPath(p.Request, urlPathStatusService) if err != nil { return nil, err } - req.URL.Path = urlPathStatusService req.URL.RawQuery = urlQueryStatusService var stats statusServiceResponse diff --git a/src/go/plugin/go.d/modules/rabbitmq/collect.go b/src/go/plugin/go.d/modules/rabbitmq/collect.go index 4176e2fd98cf16..70b2aa03355b1a 100644 --- a/src/go/plugin/go.d/modules/rabbitmq/collect.go +++ b/src/go/plugin/go.d/modules/rabbitmq/collect.go @@ -145,13 +145,11 @@ func (r *RabbitMQ) collectQueuesStats(mx map[string]int64) error { } func (r *RabbitMQ) doOKDecode(urlPath string, in interface{}) error { - req, err := web.NewHTTPRequest(r.Request.Copy()) + req, err := web.NewHTTPRequestWithPath(r.Request, urlPath) if err != nil { return fmt.Errorf("error on creating request: %v", err) } - req.URL.Path = urlPath - r.Debugf("doing HTTP %s to '%s'", req.Method, req.URL) resp, err := r.httpClient.Do(req) if err != nil { diff --git a/src/go/plugin/go.d/modules/rspamd/collect.go b/src/go/plugin/go.d/modules/rspamd/collect.go index 5cdae9b2b18681..ecbe4a034a1234 100644 --- a/src/go/plugin/go.d/modules/rspamd/collect.go +++ b/src/go/plugin/go.d/modules/rspamd/collect.go @@ -50,13 +50,11 @@ func (r *Rspamd) collect() (map[string]int64, error) { } func (r *Rspamd) queryRspamdStats() (*rspamdStats, error) { - req, err := web.NewHTTPRequest(r.Request) + req, err := web.NewHTTPRequestWithPath(r.Request, "/stat") if err != nil { return nil, err } - req.URL.Path = "/stat" - var stats rspamdStats if err := r.doOKDecode(req, &stats); err != nil { return nil, err diff --git a/src/go/plugin/go.d/pkg/web/request.go b/src/go/plugin/go.d/pkg/web/request.go index a96281a8a4d3a2..20a6ec093d70e3 100644 --- a/src/go/plugin/go.d/pkg/web/request.go +++ b/src/go/plugin/go.d/pkg/web/request.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "net/http" + "net/url" "strings" "github.com/netdata/netdata/go/plugins/pkg/buildinfo" @@ -90,3 +91,15 @@ func NewHTTPRequest(cfg Request) (*http.Request, error) { return req, nil } + +func NewHTTPRequestWithPath(cfg Request, urlPath string) (*http.Request, error) { + cfg = cfg.Copy() + + v, err := url.JoinPath(cfg.URL, urlPath) + if err != nil { + return nil, fmt.Errorf("failed to join URL path: %v", err) + } + cfg.URL = v + + return NewHTTPRequest(cfg) +} diff --git a/src/go/plugin/go.d/pkg/web/request_test.go 
b/src/go/plugin/go.d/pkg/web/request_test.go index 284cccb93ea76f..d39f9a36a13c89 100644 --- a/src/go/plugin/go.d/pkg/web/request_test.go +++ b/src/go/plugin/go.d/pkg/web/request_test.go @@ -159,6 +159,34 @@ func TestNewHTTPRequest(t *testing.T) { } } +func TestNewRequest(t *testing.T) { + tests := map[string]struct { + url string + path string + wantURL string + }{ + "base url": { + url: "http://127.0.0.1:65535", + path: "/bar", + wantURL: "http://127.0.0.1:65535/bar", + }, + "with path": { + url: "http://127.0.0.1:65535/foo/", + path: "/bar", + wantURL: "http://127.0.0.1:65535/foo/bar", + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + req, err := NewHTTPRequestWithPath(Request{URL: test.url}.Copy(), test.path) + require.NoError(t, err) + + assert.Equal(t, test.wantURL, req.URL.String()) + }) + } +} + func parseBasicAuth(auth string) (username, password string, ok bool) { const prefix = "Basic " if len(auth) < len(prefix) || !strings.EqualFold(auth[:len(prefix)], prefix) { From d6a4dc652a5bac3cf661631f644c190542ebc58b Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Sat, 13 Jul 2024 00:04:06 +0300 Subject: [PATCH 07/18] docs: go.d mysql: remove unix sockets from auto_detection (#18134) --- src/go/plugin/go.d/modules/mysql/metadata.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/go/plugin/go.d/modules/mysql/metadata.yaml b/src/go/plugin/go.d/modules/mysql/metadata.yaml index 1bc1332389cfb3..6e0d1b6b7882dc 100644 --- a/src/go/plugin/go.d/modules/mysql/metadata.yaml +++ b/src/go/plugin/go.d/modules/mysql/metadata.yaml @@ -47,12 +47,8 @@ modules: default_behavior: auto_detection: description: | - By default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets: - - - /var/run/mysqld/mysqld.sock - - /var/run/mysqld/mysql.sock - - /var/lib/mysql/mysql.sock - - /tmp/mysql.sock + By default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets: + - 127.0.0.1:3306 - "[::1]:3306" limits: From f63ee78508bfa0b9f12dc3bf9ab708bfca47040f Mon Sep 17 00:00:00 2001 From: Netdata bot <43409846+netdatabot@users.noreply.github.com> Date: Fri, 12 Jul 2024 17:12:25 -0400 Subject: [PATCH 08/18] Regenerate integrations.js (#18135) Co-authored-by: ilyam8 <22274335+ilyam8@users.noreply.github.com> --- integrations/integrations.js | 6 +++--- integrations/integrations.json | 6 +++--- src/go/plugin/go.d/modules/mysql/integrations/mariadb.md | 6 +----- src/go/plugin/go.d/modules/mysql/integrations/mysql.md | 6 +----- .../plugin/go.d/modules/mysql/integrations/percona_mysql.md | 6 +----- 5 files changed, 9 insertions(+), 21 deletions(-) diff --git a/integrations/integrations.js b/integrations/integrations.js index f1632a43b7b70b..85ca3b0207f95e 100644 --- a/integrations/integrations.js +++ b/integrations/integrations.js @@ -4992,7 +4992,7 @@ export const integrations = [ ], "most_popular": true }, - "overview": "# MariaDB\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW 
USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets:\n\n- /var/run/mysqld/mysqld.sock\n- /var/run/mysqld/mysql.sock\n- /var/lib/mysql/mysql.sock\n- /tmp/mysql.sock\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", + "overview": "# MariaDB\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets:\n\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\n#### Create netdata user\n\nA user account should have the\nfollowing [permissions](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html):\n\n- [`USAGE`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_usage)\n- [`REPLICATION CLIENT`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_replication-client)\n- [`PROCESS`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_process)\n\nTo create the `netdata` user with these permissions, execute the following in the MySQL shell:\n\n```mysql\nCREATE USER 'netdata'@'localhost';\nGRANT USAGE, REPLICATION CLIENT, PROCESS ON *.* TO 'netdata'@'localhost';\nFLUSH PRIVILEGES;\n```\n\nThe `netdata` user will have the ability to connect to the MySQL server on localhost without a password. 
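To confirm that the privileges were applied as intended, you can list the grants for the account from the MySQL shell (`SHOW GRANTS FOR` is standard MySQL/MariaDB syntax; the account name below is the `'netdata'@'localhost'` user created above):

```mysql
SHOW GRANTS FOR 'netdata'@'localhost';
```
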
It will only\nbe able to gather statistics without being able to alter or affect operations in any way.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/mysql.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/mysql.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every, autodetection_retry.\n\n\n{% details open=true summary=\"Config options\" %}\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | Data collection frequency. | 5 | no |\n| autodetection_retry | Recheck interval in seconds. Zero means no recheck will be scheduled. | 0 | no |\n| dsn | MySQL server DSN (Data Source Name). See [DSN syntax](https://github.com/go-sql-driver/mysql#dsn-data-source-name). | root@tcp(localhost:3306)/ | yes |\n| my.cnf | Specifies the my.cnf file to read the connection settings from the [client] section. | | no |\n| timeout | Query timeout in seconds. | 1 | no |\n\n{% /details %}\n#### Examples\n\n##### TCP socket\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n```\n{% /details %}\n##### Unix socket\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netdata@unix(/var/lib/mysql/mysql.sock)/\n\n```\n{% /details %}\n##### Connection with password\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netconfig:password@tcp(127.0.0.1:3306)/\n\n```\n{% /details %}\n##### my.cnf\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n my.cnf: '/etc/my.cnf'\n\n```\n{% /details %}\n##### Multi-instance\n\n> **Note**: When you define multiple jobs, their names must be unique.\n\nLocal and remote instances.\n\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n - name: remote\n dsn: netconfig:password@tcp(203.0.113.0:3306)/\n\n```\n{% /details %}\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `mysql` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m mysql\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `mysql` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. 
These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep mysql\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep mysql /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep mysql\n```\n\n", "alerts": "## Alerts\n\n\nThe following alerts are available:\n\n| Alert name | On metric | Description |\n|:------------|:----------|:------------|\n| [ mysql_10s_slow_queries ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.queries | number of slow queries in the last 10 seconds |\n| [ mysql_10s_table_locks_immediate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table immediate locks in the last 10 seconds |\n| [ mysql_10s_table_locks_waited ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table waited locks in the last 10 seconds |\n| [ mysql_10s_waited_locks_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | ratio of waited table locks over the last 10 seconds |\n| [ mysql_connections ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.connections_active | client connections utilization |\n| [ mysql_replication ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_status | replication status (0: stopped, 1: working) |\n| [ mysql_replication_lag ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_behind | difference between the timestamp of the latest transaction processed by the SQL thread and the timestamp of the same transaction when it was processed on the master |\n| [ mysql_galera_cluster_size_max_2m ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | maximum galera cluster size in the last 2 minutes starting one minute ago |\n| [ mysql_galera_cluster_size ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | current galera cluster size, compared to the maximum size in the last 2 minutes |\n| [ mysql_galera_cluster_state_warn ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Donor/Desynced or Joined |\n| [ mysql_galera_cluster_state_crit ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Undefined or Joining or Error |\n| [ mysql_galera_cluster_status ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_status | galera node is part of a nonoperational component. 
This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. |\n", @@ -5042,7 +5042,7 @@ export const integrations = [ ], "most_popular": true }, - "overview": "# MySQL\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets:\n\n- /var/run/mysqld/mysqld.sock\n- /var/run/mysqld/mysql.sock\n- /var/lib/mysql/mysql.sock\n- /tmp/mysql.sock\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", + "overview": "# MySQL\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets:\n\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\n#### Create netdata user\n\nA user account should have the\nfollowing [permissions](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html):\n\n- [`USAGE`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_usage)\n- [`REPLICATION CLIENT`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_replication-client)\n- [`PROCESS`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_process)\n\nTo create the `netdata` user with these permissions, execute the following in the MySQL shell:\n\n```mysql\nCREATE USER 'netdata'@'localhost';\nGRANT USAGE, REPLICATION 
CLIENT, PROCESS ON *.* TO 'netdata'@'localhost';\nFLUSH PRIVILEGES;\n```\n\nThe `netdata` user will have the ability to connect to the MySQL server on localhost without a password. It will only\nbe able to gather statistics without being able to alter or affect operations in any way.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/mysql.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/mysql.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every, autodetection_retry.\n\n\n{% details open=true summary=\"Config options\" %}\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | Data collection frequency. | 5 | no |\n| autodetection_retry | Recheck interval in seconds. Zero means no recheck will be scheduled. | 0 | no |\n| dsn | MySQL server DSN (Data Source Name). See [DSN syntax](https://github.com/go-sql-driver/mysql#dsn-data-source-name). | root@tcp(localhost:3306)/ | yes |\n| my.cnf | Specifies the my.cnf file to read the connection settings from the [client] section. | | no |\n| timeout | Query timeout in seconds. | 1 | no |\n\n{% /details %}\n#### Examples\n\n##### TCP socket\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n```\n{% /details %}\n##### Unix socket\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netdata@unix(/var/lib/mysql/mysql.sock)/\n\n```\n{% /details %}\n##### Connection with password\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netconfig:password@tcp(127.0.0.1:3306)/\n\n```\n{% /details %}\n##### my.cnf\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n my.cnf: '/etc/my.cnf'\n\n```\n{% /details %}\n##### Multi-instance\n\n> **Note**: When you define multiple jobs, their names must be unique.\n\nLocal and remote instances.\n\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n - name: remote\n dsn: netconfig:password@tcp(203.0.113.0:3306)/\n\n```\n{% /details %}\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `mysql` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m mysql\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `mysql` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. 
These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep mysql\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep mysql /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep mysql\n```\n\n", "alerts": "## Alerts\n\n\nThe following alerts are available:\n\n| Alert name | On metric | Description |\n|:------------|:----------|:------------|\n| [ mysql_10s_slow_queries ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.queries | number of slow queries in the last 10 seconds |\n| [ mysql_10s_table_locks_immediate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table immediate locks in the last 10 seconds |\n| [ mysql_10s_table_locks_waited ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table waited locks in the last 10 seconds |\n| [ mysql_10s_waited_locks_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | ratio of waited table locks over the last 10 seconds |\n| [ mysql_connections ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.connections_active | client connections utilization |\n| [ mysql_replication ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_status | replication status (0: stopped, 1: working) |\n| [ mysql_replication_lag ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_behind | difference between the timestamp of the latest transaction processed by the SQL thread and the timestamp of the same transaction when it was processed on the master |\n| [ mysql_galera_cluster_size_max_2m ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | maximum galera cluster size in the last 2 minutes starting one minute ago |\n| [ mysql_galera_cluster_size ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | current galera cluster size, compared to the maximum size in the last 2 minutes |\n| [ mysql_galera_cluster_state_warn ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Donor/Desynced or Joined |\n| [ mysql_galera_cluster_state_crit ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Undefined or Joining or Error |\n| [ mysql_galera_cluster_status ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_status | galera node is part of a nonoperational component. 
This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. |\n", @@ -5092,7 +5092,7 @@ export const integrations = [ ], "most_popular": false }, - "overview": "# Percona MySQL\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets:\n\n- /var/run/mysqld/mysqld.sock\n- /var/run/mysqld/mysql.sock\n- /var/lib/mysql/mysql.sock\n- /tmp/mysql.sock\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", + "overview": "# Percona MySQL\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets:\n\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\n#### Create netdata user\n\nA user account should have the\nfollowing [permissions](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html):\n\n- [`USAGE`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_usage)\n- [`REPLICATION CLIENT`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_replication-client)\n- [`PROCESS`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_process)\n\nTo create the `netdata` user with these permissions, execute the following in the MySQL shell:\n\n```mysql\nCREATE USER 'netdata'@'localhost';\nGRANT 
USAGE, REPLICATION CLIENT, PROCESS ON *.* TO 'netdata'@'localhost';\nFLUSH PRIVILEGES;\n```\n\nThe `netdata` user will have the ability to connect to the MySQL server on localhost without a password. It will only\nbe able to gather statistics without being able to alter or affect operations in any way.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/mysql.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/mysql.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every, autodetection_retry.\n\n\n{% details open=true summary=\"Config options\" %}\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | Data collection frequency. | 5 | no |\n| autodetection_retry | Recheck interval in seconds. Zero means no recheck will be scheduled. | 0 | no |\n| dsn | MySQL server DSN (Data Source Name). See [DSN syntax](https://github.com/go-sql-driver/mysql#dsn-data-source-name). | root@tcp(localhost:3306)/ | yes |\n| my.cnf | Specifies the my.cnf file to read the connection settings from the [client] section. | | no |\n| timeout | Query timeout in seconds. | 1 | no |\n\n{% /details %}\n#### Examples\n\n##### TCP socket\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n```\n{% /details %}\n##### Unix socket\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netdata@unix(/var/lib/mysql/mysql.sock)/\n\n```\n{% /details %}\n##### Connection with password\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netconfig:password@tcp(127.0.0.1:3306)/\n\n```\n{% /details %}\n##### my.cnf\n\nAn example configuration.\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n my.cnf: '/etc/my.cnf'\n\n```\n{% /details %}\n##### Multi-instance\n\n> **Note**: When you define multiple jobs, their names must be unique.\n\nLocal and remote instances.\n\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n - name: remote\n dsn: netconfig:password@tcp(203.0.113.0:3306)/\n\n```\n{% /details %}\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `mysql` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. 
If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m mysql\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `mysql` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep mysql\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep mysql /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep mysql\n```\n\n", "alerts": "## Alerts\n\n\nThe following alerts are available:\n\n| Alert name | On metric | Description |\n|:------------|:----------|:------------|\n| [ mysql_10s_slow_queries ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.queries | number of slow queries in the last 10 seconds |\n| [ mysql_10s_table_locks_immediate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table immediate locks in the last 10 seconds |\n| [ mysql_10s_table_locks_waited ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table waited locks in the last 10 seconds |\n| [ mysql_10s_waited_locks_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | ratio of waited table locks over the last 10 seconds |\n| [ mysql_connections ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.connections_active | client connections utilization |\n| [ mysql_replication ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_status | replication status (0: stopped, 1: working) |\n| [ mysql_replication_lag ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_behind | difference between the timestamp of the latest transaction processed by the SQL thread and the timestamp of the same transaction when it was processed on the master |\n| [ mysql_galera_cluster_size_max_2m ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | maximum galera cluster size in the last 2 minutes starting one minute ago |\n| [ mysql_galera_cluster_size ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | current galera cluster size, compared to the maximum size in the last 2 minutes |\n| [ 
mysql_galera_cluster_state_warn ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Donor/Desynced or Joined |\n| [ mysql_galera_cluster_state_crit ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Undefined or Joining or Error |\n| [ mysql_galera_cluster_status ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_status | galera node is part of a nonoperational component. This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. |\n", diff --git a/integrations/integrations.json b/integrations/integrations.json index 9b677b10c2a402..1272e22e927a9c 100644 --- a/integrations/integrations.json +++ b/integrations/integrations.json @@ -4990,7 +4990,7 @@ ], "most_popular": true }, - "overview": "# MariaDB\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets:\n\n- /var/run/mysqld/mysqld.sock\n- /var/run/mysqld/mysql.sock\n- /var/lib/mysql/mysql.sock\n- /tmp/mysql.sock\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", + "overview": "# MariaDB\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets:\n\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default 
configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\n#### Create netdata user\n\nA user account should have the\nfollowing [permissions](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html):\n\n- [`USAGE`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_usage)\n- [`REPLICATION CLIENT`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_replication-client)\n- [`PROCESS`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_process)\n\nTo create the `netdata` user with these permissions, execute the following in the MySQL shell:\n\n```mysql\nCREATE USER 'netdata'@'localhost';\nGRANT USAGE, REPLICATION CLIENT, PROCESS ON *.* TO 'netdata'@'localhost';\nFLUSH PRIVILEGES;\n```\n\nThe `netdata` user will have the ability to connect to the MySQL server on localhost without a password. It will only\nbe able to gather statistics without being able to alter or affect operations in any way.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/mysql.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/mysql.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every, autodetection_retry.\n\n\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | Data collection frequency. | 5 | no |\n| autodetection_retry | Recheck interval in seconds. Zero means no recheck will be scheduled. | 0 | no |\n| dsn | MySQL server DSN (Data Source Name). See [DSN syntax](https://github.com/go-sql-driver/mysql#dsn-data-source-name). | root@tcp(localhost:3306)/ | yes |\n| my.cnf | Specifies the my.cnf file to read the connection settings from the [client] section. | | no |\n| timeout | Query timeout in seconds. | 1 | no |\n\n#### Examples\n\n##### TCP socket\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n```\n##### Unix socket\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n dsn: netdata@unix(/var/lib/mysql/mysql.sock)/\n\n```\n##### Connection with password\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n dsn: netconfig:password@tcp(127.0.0.1:3306)/\n\n```\n##### my.cnf\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n my.cnf: '/etc/my.cnf'\n\n```\n##### Multi-instance\n\n> **Note**: When you define multiple jobs, their names must be unique.\n\nLocal and remote instances.\n\n\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n - name: remote\n dsn: netconfig:password@tcp(203.0.113.0:3306)/\n\n```\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `mysql` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. 
If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m mysql\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `mysql` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep mysql\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep mysql /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep mysql\n```\n\n", "alerts": "## Alerts\n\n\nThe following alerts are available:\n\n| Alert name | On metric | Description |\n|:------------|:----------|:------------|\n| [ mysql_10s_slow_queries ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.queries | number of slow queries in the last 10 seconds |\n| [ mysql_10s_table_locks_immediate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table immediate locks in the last 10 seconds |\n| [ mysql_10s_table_locks_waited ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table waited locks in the last 10 seconds |\n| [ mysql_10s_waited_locks_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | ratio of waited table locks over the last 10 seconds |\n| [ mysql_connections ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.connections_active | client connections utilization |\n| [ mysql_replication ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_status | replication status (0: stopped, 1: working) |\n| [ mysql_replication_lag ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_behind | difference between the timestamp of the latest transaction processed by the SQL thread and the timestamp of the same transaction when it was processed on the master |\n| [ mysql_galera_cluster_size_max_2m ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | maximum galera cluster size in the last 2 minutes starting one minute ago |\n| [ mysql_galera_cluster_size ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | current galera cluster size, compared to the maximum size in the last 2 minutes |\n| [ 
mysql_galera_cluster_state_warn ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Donor/Desynced or Joined |\n| [ mysql_galera_cluster_state_crit ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Undefined or Joining or Error |\n| [ mysql_galera_cluster_status ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_status | galera node is part of a nonoperational component. This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. |\n", @@ -5040,7 +5040,7 @@ ], "most_popular": true }, - "overview": "# MySQL\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets:\n\n- /var/run/mysqld/mysqld.sock\n- /var/run/mysqld/mysql.sock\n- /var/lib/mysql/mysql.sock\n- /tmp/mysql.sock\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", + "overview": "# MySQL\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets:\n\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\n#### Create netdata user\n\nA user account should 
have the\nfollowing [permissions](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html):\n\n- [`USAGE`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_usage)\n- [`REPLICATION CLIENT`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_replication-client)\n- [`PROCESS`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_process)\n\nTo create the `netdata` user with these permissions, execute the following in the MySQL shell:\n\n```mysql\nCREATE USER 'netdata'@'localhost';\nGRANT USAGE, REPLICATION CLIENT, PROCESS ON *.* TO 'netdata'@'localhost';\nFLUSH PRIVILEGES;\n```\n\nThe `netdata` user will have the ability to connect to the MySQL server on localhost without a password. It will only\nbe able to gather statistics without being able to alter or affect operations in any way.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/mysql.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/mysql.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every, autodetection_retry.\n\n\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | Data collection frequency. | 5 | no |\n| autodetection_retry | Recheck interval in seconds. Zero means no recheck will be scheduled. | 0 | no |\n| dsn | MySQL server DSN (Data Source Name). See [DSN syntax](https://github.com/go-sql-driver/mysql#dsn-data-source-name). | root@tcp(localhost:3306)/ | yes |\n| my.cnf | Specifies the my.cnf file to read the connection settings from the [client] section. | | no |\n| timeout | Query timeout in seconds. | 1 | no |\n\n#### Examples\n\n##### TCP socket\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n```\n##### Unix socket\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n dsn: netdata@unix(/var/lib/mysql/mysql.sock)/\n\n```\n##### Connection with password\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n dsn: netconfig:password@tcp(127.0.0.1:3306)/\n\n```\n##### my.cnf\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n my.cnf: '/etc/my.cnf'\n\n```\n##### Multi-instance\n\n> **Note**: When you define multiple jobs, their names must be unique.\n\nLocal and remote instances.\n\n\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n - name: remote\n dsn: netconfig:password@tcp(203.0.113.0:3306)/\n\n```\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `mysql` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. 
If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m mysql\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `mysql` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep mysql\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep mysql /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep mysql\n```\n\n", "alerts": "## Alerts\n\n\nThe following alerts are available:\n\n| Alert name | On metric | Description |\n|:------------|:----------|:------------|\n| [ mysql_10s_slow_queries ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.queries | number of slow queries in the last 10 seconds |\n| [ mysql_10s_table_locks_immediate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table immediate locks in the last 10 seconds |\n| [ mysql_10s_table_locks_waited ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table waited locks in the last 10 seconds |\n| [ mysql_10s_waited_locks_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | ratio of waited table locks over the last 10 seconds |\n| [ mysql_connections ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.connections_active | client connections utilization |\n| [ mysql_replication ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_status | replication status (0: stopped, 1: working) |\n| [ mysql_replication_lag ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_behind | difference between the timestamp of the latest transaction processed by the SQL thread and the timestamp of the same transaction when it was processed on the master |\n| [ mysql_galera_cluster_size_max_2m ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | maximum galera cluster size in the last 2 minutes starting one minute ago |\n| [ mysql_galera_cluster_size ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | current galera cluster size, compared to the maximum size in the last 2 minutes |\n| [ 
mysql_galera_cluster_state_warn ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Donor/Desynced or Joined |\n| [ mysql_galera_cluster_state_crit ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Undefined or Joining or Error |\n| [ mysql_galera_cluster_status ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_status | galera node is part of a nonoperational component. This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. |\n", @@ -5090,7 +5090,7 @@ ], "most_popular": false }, - "overview": "# Percona MySQL\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets:\n\n- /var/run/mysqld/mysqld.sock\n- /var/run/mysqld/mysql.sock\n- /var/lib/mysql/mysql.sock\n- /tmp/mysql.sock\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", + "overview": "# Percona MySQL\n\nPlugin: go.d.plugin\nModule: mysql\n\n## Overview\n\nThis collector monitors the health and performance of MySQL servers and collects general statistics, replication and user metrics.\n\n\nIt connects to the MySQL instance via a TCP or UNIX socket and executes the following commands:\n\nExecuted queries:\n\n- `SELECT VERSION();`\n- `SHOW GLOBAL STATUS;`\n- `SHOW GLOBAL VARIABLES;`\n- `SHOW SLAVE STATUS;` or `SHOW ALL SLAVES STATUS;` (MariaDBv10.2+) or `SHOW REPLICA STATUS;` (MySQL 8.0.22+)\n- `SHOW USER_STATISTICS;` (MariaDBv10.1.1+)\n- `SELECT TIME,USER FROM INFORMATION_SCHEMA.PROCESSLIST;`\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nBy default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets:\n\n- 127.0.0.1:3306\n- \"[::1]:3306\"\n\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\n#### Create netdata user\n\nA 
user account should have the\nfollowing [permissions](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html):\n\n- [`USAGE`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_usage)\n- [`REPLICATION CLIENT`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_replication-client)\n- [`PROCESS`](https://dev.mysql.com/doc/refman/8.0/en/privileges-provided.html#priv_process)\n\nTo create the `netdata` user with these permissions, execute the following in the MySQL shell:\n\n```mysql\nCREATE USER 'netdata'@'localhost';\nGRANT USAGE, REPLICATION CLIENT, PROCESS ON *.* TO 'netdata'@'localhost';\nFLUSH PRIVILEGES;\n```\n\nThe `netdata` user will have the ability to connect to the MySQL server on localhost without a password. It will only\nbe able to gather statistics without being able to alter or affect operations in any way.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/mysql.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/mysql.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every, autodetection_retry.\n\n\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | Data collection frequency. | 5 | no |\n| autodetection_retry | Recheck interval in seconds. Zero means no recheck will be scheduled. | 0 | no |\n| dsn | MySQL server DSN (Data Source Name). See [DSN syntax](https://github.com/go-sql-driver/mysql#dsn-data-source-name). | root@tcp(localhost:3306)/ | yes |\n| my.cnf | Specifies the my.cnf file to read the connection settings from the [client] section. | | no |\n| timeout | Query timeout in seconds. | 1 | no |\n\n#### Examples\n\n##### TCP socket\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n```\n##### Unix socket\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n dsn: netdata@unix(/var/lib/mysql/mysql.sock)/\n\n```\n##### Connection with password\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n dsn: netconfig:password@tcp(127.0.0.1:3306)/\n\n```\n##### my.cnf\n\nAn example configuration.\n\n```yaml\njobs:\n - name: local\n my.cnf: '/etc/my.cnf'\n\n```\n##### Multi-instance\n\n> **Note**: When you define multiple jobs, their names must be unique.\n\nLocal and remote instances.\n\n\n```yaml\njobs:\n - name: local\n dsn: netdata@tcp(127.0.0.1:3306)/\n\n - name: remote\n dsn: netconfig:password@tcp(203.0.113.0:3306)/\n\n```\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `mysql` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. 
If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m mysql\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `mysql` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep mysql\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep mysql /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep mysql\n```\n\n", "alerts": "## Alerts\n\n\nThe following alerts are available:\n\n| Alert name | On metric | Description |\n|:------------|:----------|:------------|\n| [ mysql_10s_slow_queries ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.queries | number of slow queries in the last 10 seconds |\n| [ mysql_10s_table_locks_immediate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table immediate locks in the last 10 seconds |\n| [ mysql_10s_table_locks_waited ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | number of table waited locks in the last 10 seconds |\n| [ mysql_10s_waited_locks_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.table_locks | ratio of waited table locks over the last 10 seconds |\n| [ mysql_connections ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.connections_active | client connections utilization |\n| [ mysql_replication ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_status | replication status (0: stopped, 1: working) |\n| [ mysql_replication_lag ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.slave_behind | difference between the timestamp of the latest transaction processed by the SQL thread and the timestamp of the same transaction when it was processed on the master |\n| [ mysql_galera_cluster_size_max_2m ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | maximum galera cluster size in the last 2 minutes starting one minute ago |\n| [ mysql_galera_cluster_size ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_size | current galera cluster size, compared to the maximum size in the last 2 minutes |\n| [ 
mysql_galera_cluster_state_warn ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Donor/Desynced or Joined |\n| [ mysql_galera_cluster_state_crit ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_state | galera node state is either Undefined or Joining or Error |\n| [ mysql_galera_cluster_status ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mysql.conf) | mysql.galera_cluster_status | galera node is part of a nonoperational component. This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. |\n", diff --git a/src/go/plugin/go.d/modules/mysql/integrations/mariadb.md b/src/go/plugin/go.d/modules/mysql/integrations/mariadb.md index 1ee45f69766909..038119ed0376dd 100644 --- a/src/go/plugin/go.d/modules/mysql/integrations/mariadb.md +++ b/src/go/plugin/go.d/modules/mysql/integrations/mariadb.md @@ -45,12 +45,8 @@ This collector supports collecting metrics from multiple instances of this integ #### Auto-Detection -By default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets: +By default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets: -- /var/run/mysqld/mysqld.sock -- /var/run/mysqld/mysql.sock -- /var/lib/mysql/mysql.sock -- /tmp/mysql.sock - 127.0.0.1:3306 - "[::1]:3306" diff --git a/src/go/plugin/go.d/modules/mysql/integrations/mysql.md b/src/go/plugin/go.d/modules/mysql/integrations/mysql.md index 03c9935f68b933..aadb0f5da18137 100644 --- a/src/go/plugin/go.d/modules/mysql/integrations/mysql.md +++ b/src/go/plugin/go.d/modules/mysql/integrations/mysql.md @@ -45,12 +45,8 @@ This collector supports collecting metrics from multiple instances of this integ #### Auto-Detection -By default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets: +By default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets: -- /var/run/mysqld/mysqld.sock -- /var/run/mysqld/mysql.sock -- /var/lib/mysql/mysql.sock -- /tmp/mysql.sock - 127.0.0.1:3306 - "[::1]:3306" diff --git a/src/go/plugin/go.d/modules/mysql/integrations/percona_mysql.md b/src/go/plugin/go.d/modules/mysql/integrations/percona_mysql.md index 665ab0796a89a0..3f392b223fc83a 100644 --- a/src/go/plugin/go.d/modules/mysql/integrations/percona_mysql.md +++ b/src/go/plugin/go.d/modules/mysql/integrations/percona_mysql.md @@ -45,12 +45,8 @@ This collector supports collecting metrics from multiple instances of this integ #### Auto-Detection -By default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP and UNIX sockets: +By default, it detects instances running on localhost by trying to connect as root and netdata using known MySQL TCP sockets: -- /var/run/mysqld/mysqld.sock -- /var/run/mysqld/mysql.sock -- /var/lib/mysql/mysql.sock -- /tmp/mysql.sock - 127.0.0.1:3306 - "[::1]:3306" From 72c13bd405f2272a5b2643018818268bba1222e3 Mon Sep 17 00:00:00 2001 From: Costa Tsaousis Date: Sat, 13 Jul 2024 01:37:11 +0300 Subject: [PATCH 09/18] Spawn server fixes 6 (#18136) * spawn server reopens the log files after closing all sockets * close all sockets only when we run an external command * the spawn server disables all 
log methods except daemon and collectors --- src/daemon/commands.c | 2 +- src/libnetdata/log/log.c | 30 +++++++++++++++++++--- src/libnetdata/log/log.h | 3 ++- src/libnetdata/spawn_server/spawn_server.c | 27 +++++++++---------- 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/daemon/commands.c b/src/daemon/commands.c index 70ba11d424222e..2e1742a86948a4 100644 --- a/src/daemon/commands.c +++ b/src/daemon/commands.c @@ -164,7 +164,7 @@ static cmd_status_t cmd_reopen_logs_execute(char *args, char **message) (void)message; nd_log_limits_unlimited(); - nd_log_reopen_log_files(); + nd_log_reopen_log_files(true); nd_log_limits_reset(); return CMD_STATUS_SUCCESS; diff --git a/src/libnetdata/log/log.c b/src/libnetdata/log/log.c index 135d20f6f3763a..a31127c42d286e 100644 --- a/src/libnetdata/log/log.c +++ b/src/libnetdata/log/log.c @@ -992,14 +992,38 @@ void nd_log_initialize(void) { nd_log_open(&nd_log.sources[i], i); } -void nd_log_reopen_log_files(void) { - netdata_log_info("Reopening all log files."); +void nd_log_reopen_log_files(bool log) { + if(log) + netdata_log_info("Reopening all log files."); nd_log.std_output.initialized = false; nd_log.std_error.initialized = false; nd_log_initialize(); - netdata_log_info("Log files re-opened."); + if(log) + netdata_log_info("Log files re-opened."); +} + +void nd_log_reopen_log_files_for_spawn_server(void) { + if(nd_log.syslog.initialized) { + closelog(); + nd_log.syslog.initialized = false; + nd_log_syslog_init(); + } + + if(nd_log.journal_direct.initialized) { + close(nd_log.journal_direct.fd); + nd_log.journal_direct.fd = -1; + nd_log.journal_direct.initialized = false; + nd_log_journal_direct_init(NULL); + } + + nd_log.sources[NDLS_UNSET].method = NDLM_DISABLED; + nd_log.sources[NDLS_ACCESS].method = NDLM_DISABLED; + nd_log.sources[NDLS_ACLK].method = NDLM_DISABLED; + nd_log.sources[NDLS_DEBUG].method = NDLM_DISABLED; + nd_log.sources[NDLS_HEALTH].method = NDLM_DISABLED; + nd_log_reopen_log_files(false); } void chown_open_file(int fd, uid_t uid, gid_t gid) { diff --git a/src/libnetdata/log/log.h b/src/libnetdata/log/log.h index 7517d9d667fbaa..015c02eb64aaf4 100644 --- a/src/libnetdata/log/log.h +++ b/src/libnetdata/log/log.h @@ -149,11 +149,12 @@ void nd_log_set_user_settings(ND_LOG_SOURCES source, const char *setting); void nd_log_set_facility(const char *facility); void nd_log_set_priority_level(const char *setting); void nd_log_initialize(void); -void nd_log_reopen_log_files(void); +void nd_log_reopen_log_files(bool log); void chown_open_file(int fd, uid_t uid, gid_t gid); void nd_log_chown_log_files(uid_t uid, gid_t gid); void nd_log_set_flood_protection(size_t logs, time_t period); void nd_log_initialize_for_external_plugins(const char *name); +void nd_log_reopen_log_files_for_spawn_server(void); bool nd_log_journal_socket_available(void); ND_LOG_FIELD_ID nd_log_field_id_by_name(const char *field, size_t len); int nd_log_priority2id(const char *priority); diff --git a/src/libnetdata/spawn_server/spawn_server.c b/src/libnetdata/spawn_server/spawn_server.c index 8e58e9ece3bb5b..ef6755c324decd 100644 --- a/src/libnetdata/spawn_server/spawn_server.c +++ b/src/libnetdata/spawn_server/spawn_server.c @@ -59,12 +59,14 @@ SPAWN_SERVER* spawn_server_create(SPAWN_SERVER_OPTIONS options __maybe_unused, c SPAWN_SERVER* server = callocz(1, sizeof(SPAWN_SERVER)); if(name) server->name = strdupz(name); + else + server->name = strdupz("unnamed"); return server; } void spawn_server_destroy(SPAWN_SERVER *server) { if (server) { - 
if(server->name) freez((void *)server->name); + freez((void *)server->name); freez(server); } } @@ -329,20 +331,13 @@ static void spawn_server_run_child(SPAWN_SERVER *server, SPAWN_REQUEST *rq) { if(server->pipe[1] != -1) { close(server->pipe[1]); server->pipe[1] = -1; } // set the process name - { - char buf[15]; - snprintfz(buf, sizeof(buf), "chld-%zu-r%zu", server->id, rq->request_id); - os_setproctitle(buf, server->argc, server->argv); - } - - // just a precausion in case we have any left-over fds - os_close_all_non_std_open_fds_except(rq->fds, SPAWN_SERVER_TRANSFER_FDS); + os_setproctitle("spawn-child", server->argc, server->argv); // get the fds from the request int stdin_fd = rq->fds[0]; int stdout_fd = rq->fds[1]; int stderr_fd = rq->fds[2]; - int custom_fd = rq->fds[3]; + int custom_fd = rq->fds[3]; (void)custom_fd; // change stdio fds to the ones in the request if (dup2(stdin_fd, STDIN_FILENO) == -1) { @@ -376,11 +371,16 @@ static void spawn_server_run_child(SPAWN_SERVER *server, SPAWN_REQUEST *rq) { switch (rq->type) { case SPAWN_INSTANCE_TYPE_EXEC: - if(custom_fd != -1) { close(custom_fd); custom_fd = -1; } + // close all fds except the ones we need + os_close_all_non_std_open_fds_except(NULL, 0); + + // run the command execvp(rq->argv[0], (char **)rq->argv); + nd_log(NDLS_COLLECTORS, NDLP_ERR, "SPAWN SERVER: Failed to execute command of request No %zu: %s", rq->request_id, rq->cmdline); + exit(1); break; @@ -1135,6 +1135,7 @@ static void spawn_server_event_loop(SPAWN_SERVER *server) { if (spawn_server_sigchld) { spawn_server_sigchld = false; spawn_server_process_sigchld(); + errno_clear(); if(ret == -1) continue; @@ -1310,8 +1311,7 @@ SPAWN_SERVER* spawn_server_create(SPAWN_SERVER_OPTIONS options, const char *name snprintf(path, sizeof(path), "%s/.netdata-spawn-%s.sock", runtime_directory, name); } else { - snprintfz(path, sizeof(path), "%d-%zu", getpid(), server->id); - server->name = strdupz(path); + server->name = strdupz("unnamed"); snprintf(path, sizeof(path), "%s/.netdata-spawn-%d-%zu.sock", runtime_directory, getpid(), server->id); } @@ -1337,6 +1337,7 @@ SPAWN_SERVER* spawn_server_create(SPAWN_SERVER_OPTIONS options, const char *name replace_stdio_with_dev_null(); os_close_all_non_std_open_fds_except((int[]){ server->sock, server->pipe[1] }, 2); + nd_log_reopen_log_files_for_spawn_server(); spawn_server_event_loop(server); } else if (pid > 0) { From 82e189d8a0cf83c4d44130d5bc0ce2939d7bcb89 Mon Sep 17 00:00:00 2001 From: netdatabot Date: Sat, 13 Jul 2024 00:18:32 +0000 Subject: [PATCH 10/18] [ci skip] Update changelog and version for nightly build: v1.46.0-149-nightly. 
--- CHANGELOG.md | 19 +++++++++---------- packaging/version | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 387b50f6b2e063..3b496425372a03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,15 @@ **Merged pull requests:** +- Spawn server fixes 6 [\#18136](https://github.com/netdata/netdata/pull/18136) ([ktsaou](https://github.com/ktsaou)) +- Regenerate integrations.js [\#18135](https://github.com/netdata/netdata/pull/18135) ([netdatabot](https://github.com/netdatabot)) +- docs: go.d mysql: remove unix sockets from auto\_detection [\#18134](https://github.com/netdata/netdata/pull/18134) ([ilyam8](https://github.com/ilyam8)) +- go.d fix url path overwrite [\#18132](https://github.com/netdata/netdata/pull/18132) ([ilyam8](https://github.com/ilyam8)) +- Spawn server improvements 5 [\#18131](https://github.com/netdata/netdata/pull/18131) ([ktsaou](https://github.com/ktsaou)) +- Spawn server fixes No 4 [\#18127](https://github.com/netdata/netdata/pull/18127) ([ktsaou](https://github.com/ktsaou)) +- go.d filecheck fix dir existence chart label [\#18126](https://github.com/netdata/netdata/pull/18126) ([ilyam8](https://github.com/ilyam8)) +- Regenerate integrations.js [\#18124](https://github.com/netdata/netdata/pull/18124) ([netdatabot](https://github.com/netdatabot)) +- go.d whoisquery fix "days until" in config\_schema.json [\#18121](https://github.com/netdata/netdata/pull/18121) ([ilyam8](https://github.com/ilyam8)) - go.d smartctl: add scsi read/write/verify error rate [\#18119](https://github.com/netdata/netdata/pull/18119) ([ilyam8](https://github.com/ilyam8)) - log in the same line [\#18118](https://github.com/netdata/netdata/pull/18118) ([ktsaou](https://github.com/ktsaou)) - spawn server fixes 3 [\#18117](https://github.com/netdata/netdata/pull/18117) ([ktsaou](https://github.com/ktsaou)) @@ -403,16 +412,6 @@ - Cpack fixes [\#17576](https://github.com/netdata/netdata/pull/17576) ([vkalintiris](https://github.com/vkalintiris)) - Fix compilation without `dbengine` [\#17575](https://github.com/netdata/netdata/pull/17575) ([thiagoftsm](https://github.com/thiagoftsm)) - Fix handling of netdata.conf on install in build system. 
[\#17572](https://github.com/netdata/netdata/pull/17572) ([Ferroin](https://github.com/Ferroin)) -- Update Netdata subscription plans documentation [\#17571](https://github.com/netdata/netdata/pull/17571) ([Ancairon](https://github.com/Ancairon)) -- go.d prometheus remove apostrophe in label values [\#17570](https://github.com/netdata/netdata/pull/17570) ([ilyam8](https://github.com/ilyam8)) -- remove go.d symbol/debug info with RelWithDebInfo [\#17569](https://github.com/netdata/netdata/pull/17569) ([ilyam8](https://github.com/ilyam8)) -- go.d smartctl add meta setup prerequisites [\#17568](https://github.com/netdata/netdata/pull/17568) ([ilyam8](https://github.com/ilyam8)) -- Regenerate integrations.js [\#17567](https://github.com/netdata/netdata/pull/17567) ([netdatabot](https://github.com/netdatabot)) -- Increase the message size to the spawn server [\#17566](https://github.com/netdata/netdata/pull/17566) ([stelfrag](https://github.com/stelfrag)) -- go.d smartctl small improvements [\#17565](https://github.com/netdata/netdata/pull/17565) ([ilyam8](https://github.com/ilyam8)) -- go.d smartctl improve units [\#17564](https://github.com/netdata/netdata/pull/17564) ([ilyam8](https://github.com/ilyam8)) -- Regenerate integrations.js [\#17561](https://github.com/netdata/netdata/pull/17561) ([netdatabot](https://github.com/netdatabot)) -- Regenerate integrations.js [\#17560](https://github.com/netdata/netdata/pull/17560) ([netdatabot](https://github.com/netdatabot)) ## [v1.45.6](https://github.com/netdata/netdata/tree/v1.45.6) (2024-06-05) diff --git a/packaging/version b/packaging/version index 0e8cde4e778732..f0f7e39dd06c0a 100644 --- a/packaging/version +++ b/packaging/version @@ -1 +1 @@ -v1.46.0-139-nightly +v1.46.0-149-nightly From 92b00666e19594d77b17fce91ef849ef9cbb3baf Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Sat, 13 Jul 2024 23:19:38 +0300 Subject: [PATCH 11/18] go.d smartctl add "extra_devices" option (#18140) --- .../plugin/go.d/modules/smartctl/collect.go | 5 +- .../go.d/modules/smartctl/config_schema.json | 45 ++++++- src/go/plugin/go.d/modules/smartctl/init.go | 7 ++ .../go.d/modules/smartctl/metadata.yaml | 13 ++ src/go/plugin/go.d/modules/smartctl/scan.go | 15 ++- .../plugin/go.d/modules/smartctl/smartctl.go | 23 ++-- .../go.d/modules/smartctl/smartctl_test.go | 37 +++++- .../modules/smartctl/testdata/config.json | 8 +- .../modules/smartctl/testdata/config.yaml | 5 +- .../testdata/type-nvme/device-nvme1.json | 113 ++++++++++++++++++ 10 files changed, 252 insertions(+), 19 deletions(-) create mode 100644 src/go/plugin/go.d/modules/smartctl/testdata/type-nvme/device-nvme1.json diff --git a/src/go/plugin/go.d/modules/smartctl/collect.go b/src/go/plugin/go.d/modules/smartctl/collect.go index 0f53f0bcfddb4d..e8f8357d680c56 100644 --- a/src/go/plugin/go.d/modules/smartctl/collect.go +++ b/src/go/plugin/go.d/modules/smartctl/collect.go @@ -42,7 +42,8 @@ func (s *Smartctl) collect() (map[string]int64, error) { // TODO: make it concurrent for _, d := range s.scannedDevices { if err := s.collectScannedDevice(mx, d); err != nil { - return nil, err + s.Warning(err) + continue } } @@ -57,7 +58,7 @@ func (s *Smartctl) collect() (map[string]int64, error) { func (s *Smartctl) collectScannedDevice(mx map[string]int64, scanDev *scanDevice) error { resp, err := s.exec.deviceInfo(scanDev.name, scanDev.typ, s.NoCheckPowerMode) if err != nil { - if resp != nil && isDeviceOpenFailedNoSuchDevice(resp) { + if resp != nil && isDeviceOpenFailedNoSuchDevice(resp) && !scanDev.extra { 
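			// reviewer note: this branch only fires for devices that came from `smartctl --scan`.
			// A device supplied through the new `extra_devices` option is excluded by the
			// `!scanDev.extra` guard above, so a "no such device" reply for it is returned as an
			// error, which the updated collect() above now logs as a warning instead of aborting.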
s.Infof("smartctl reported that device '%s' type '%s' no longer exists", scanDev.name, scanDev.typ) s.forceScan = true return nil diff --git a/src/go/plugin/go.d/modules/smartctl/config_schema.json b/src/go/plugin/go.d/modules/smartctl/config_schema.json index 8093cc5f8f943e..e03f8081da0950 100644 --- a/src/go/plugin/go.d/modules/smartctl/config_schema.json +++ b/src/go/plugin/go.d/modules/smartctl/config_schema.json @@ -50,6 +50,36 @@ "type": "string", "minimum": 1, "default": "*" + }, + "extra_devices": { + "title": "Extra devices", + "description": "Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type.", + "type": [ + "array", + "null" + ], + "uniqueItems": true, + "items": { + "title": "Device", + "type": [ + "object", + "null" + ], + "required": [ + "name", + "type" + ], + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "type": { + "title": "Type", + "type": "string" + } + } + } } }, "additionalProperties": false, @@ -75,9 +105,10 @@ ] }, { - "title": "Filtering", + "title": "Devices", "fields": [ - "device_selector" + "device_selector", + "extra_devices" ] } ] @@ -94,6 +125,16 @@ }, "device_selector": { "ui:help": "Leave blank or use `*` to collect data for all devices." + }, + "extra_devices": { + "items": { + "name": { + "ui:placeholder": "/dev/sda" + }, + "type": { + "ui:placeholder": "jmb39x-q,3" + } + } } } } diff --git a/src/go/plugin/go.d/modules/smartctl/init.go b/src/go/plugin/go.d/modules/smartctl/init.go index a8c8bb017dd1fa..6d3731a1801489 100644 --- a/src/go/plugin/go.d/modules/smartctl/init.go +++ b/src/go/plugin/go.d/modules/smartctl/init.go @@ -17,6 +17,13 @@ func (s *Smartctl) validateConfig() error { default: return fmt.Errorf("invalid power mode '%s'", s.NoCheckPowerMode) } + + for _, v := range s.ExtraDevices { + if v.Name == "" || v.Type == "" { + return fmt.Errorf("invalid extra device: name and type must both be provided, got name='%s' type='%s'", v.Name, v.Type) + } + } + return nil } diff --git a/src/go/plugin/go.d/modules/smartctl/metadata.yaml b/src/go/plugin/go.d/modules/smartctl/metadata.yaml index ade5c2162ac6e6..0b54f69fbecf90 100644 --- a/src/go/plugin/go.d/modules/smartctl/metadata.yaml +++ b/src/go/plugin/go.d/modules/smartctl/metadata.yaml @@ -108,6 +108,10 @@ modules: description: "Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`." default_value: "*" required: false + - name: extra_devices + description: "Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type. See \"Configuration Examples\" for details." + default_value: "[]" + required: false - name: no_check_power_mode description: "Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up." default_value: standby @@ -132,6 +136,15 @@ modules: jobs: - name: smartctl devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds + - name: Extra devices + description: | + This example demonstrates using `extra_devices` to manually add a storage device (`/dev/sdc`) not automatically detected by `smartctl --scan`. 
+ config: | + jobs: + - name: smartctl + extra_devices: + - name: /dev/sdc + type: jmb39x-q,3 troubleshooting: problems: list: [] diff --git a/src/go/plugin/go.d/modules/smartctl/scan.go b/src/go/plugin/go.d/modules/smartctl/scan.go index 9310938f673f91..e4291be4f91889 100644 --- a/src/go/plugin/go.d/modules/smartctl/scan.go +++ b/src/go/plugin/go.d/modules/smartctl/scan.go @@ -12,6 +12,7 @@ type scanDevice struct { name string infoName string typ string + extra bool // added via config "extra_devices" } func (s *scanDevice) key() string { @@ -65,11 +66,21 @@ func (s *Smartctl) scanDevices() (map[string]*scanDevice, error) { devices[dev.key()] = dev } + s.Debugf("smartctl scan found %d devices", len(devices)) + + for _, v := range s.ExtraDevices { + if v.Name == "" || v.Type == "" { + continue + } + dev := &scanDevice{name: v.Name, typ: v.Type, extra: true} + if _, ok := devices[dev.key()]; !ok { + devices[dev.key()] = dev + } + } + if len(devices) == 0 { return nil, errors.New("no devices found during scan") } - s.Debugf("smartctl scan found %d devices", len(devices)) - return devices, nil } diff --git a/src/go/plugin/go.d/modules/smartctl/smartctl.go b/src/go/plugin/go.d/modules/smartctl/smartctl.go index b64e99c22a7b4c..1ea1a8fbaf8cc0 100644 --- a/src/go/plugin/go.d/modules/smartctl/smartctl.go +++ b/src/go/plugin/go.d/modules/smartctl/smartctl.go @@ -43,14 +43,21 @@ func New() *Smartctl { } } -type Config struct { - UpdateEvery int `yaml:"update_every,omitempty" json:"update_every"` - Timeout web.Duration `yaml:"timeout,omitempty" json:"timeout"` - ScanEvery web.Duration `yaml:"scan_every,omitempty" json:"scan_every"` - PollDevicesEvery web.Duration `yaml:"poll_devices_every,omitempty" json:"poll_devices_every"` - NoCheckPowerMode string `yaml:"no_check_power_mode,omitempty" json:"no_check_power_mode"` - DeviceSelector string `yaml:"device_selector,omitempty" json:"device_selector"` -} +type ( + Config struct { + UpdateEvery int `yaml:"update_every,omitempty" json:"update_every"` + Timeout web.Duration `yaml:"timeout,omitempty" json:"timeout"` + ScanEvery web.Duration `yaml:"scan_every,omitempty" json:"scan_every"` + PollDevicesEvery web.Duration `yaml:"poll_devices_every,omitempty" json:"poll_devices_every"` + NoCheckPowerMode string `yaml:"no_check_power_mode,omitempty" json:"no_check_power_mode"` + DeviceSelector string `yaml:"device_selector,omitempty" json:"device_selector"` + ExtraDevices []ConfigExtraDevice `yaml:"extra_devices,omitempty" json:"extra_devices"` + } + ConfigExtraDevice struct { + Name string `yaml:"name" json:"name"` + Type string `yaml:"type" json:"type"` + } +) type ( Smartctl struct { diff --git a/src/go/plugin/go.d/modules/smartctl/smartctl_test.go b/src/go/plugin/go.d/modules/smartctl/smartctl_test.go index 2008f0ac4d4a1f..2f8d4d66a13229 100644 --- a/src/go/plugin/go.d/modules/smartctl/smartctl_test.go +++ b/src/go/plugin/go.d/modules/smartctl/smartctl_test.go @@ -26,6 +26,7 @@ var ( dataTypeNvmeScan, _ = os.ReadFile("testdata/type-nvme/scan.json") dataTypeNvmeDeviceNvme0, _ = os.ReadFile("testdata/type-nvme/device-nvme0.json") + dataTypeNvmeDeviceNvme1, _ = os.ReadFile("testdata/type-nvme/device-nvme1.json") dataTypeScsiScan, _ = os.ReadFile("testdata/type-scsi/scan.json") dataTypeScsiDeviceSda, _ = os.ReadFile("testdata/type-scsi/device-sda.json") @@ -42,6 +43,7 @@ func Test_testDataIsValid(t *testing.T) { "dataTypeNvmeScan": dataTypeNvmeScan, "dataTypeNvmeDeviceNvme0": dataTypeNvmeDeviceNvme0, + "dataTypeNvmeDeviceNvme1": dataTypeNvmeDeviceNvme1, 
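		// reviewer note: device-nvme1.json (added in this patch) backs the mock response for
		// /dev/nvme1 and is exercised by the new "success type nvme devices with extra" case,
		// where the device is supplied via extra_devices rather than found by `smartctl --scan`.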
"dataTypeScsiScan": dataTypeScsiScan, "dataTypeScsiDeviceSda": dataTypeScsiDeviceSda, @@ -166,9 +168,10 @@ func TestSmartctl_Check(t *testing.T) { func TestSmartctl_Collect(t *testing.T) { tests := map[string]struct { - prepareMock func() *mockSmartctlCliExec - wantMetrics map[string]int64 - wantCharts int + prepareMock func() *mockSmartctlCliExec + prepareConfig func() Config + wantMetrics map[string]int64 + wantCharts int }{ "success type sata devices": { prepareMock: prepareMockOkTypeSata, @@ -295,6 +298,29 @@ func TestSmartctl_Collect(t *testing.T) { "device_nvme0_type_nvme_temperature": 39, }, }, + "success type nvme devices with extra": { + prepareMock: prepareMockOkTypeNvme, + prepareConfig: func() Config { + cfg := New().Config + cfg.ExtraDevices = []ConfigExtraDevice{ + {Name: "/dev/nvme1", Type: "nvme"}, + } + return cfg + }, + wantCharts: 8, + wantMetrics: map[string]int64{ + "device_nvme0_type_nvme_power_cycle_count": 2, + "device_nvme0_type_nvme_power_on_time": 11206800, + "device_nvme0_type_nvme_smart_status_failed": 0, + "device_nvme0_type_nvme_smart_status_passed": 1, + "device_nvme0_type_nvme_temperature": 39, + "device_nvme1_type_nvme_power_cycle_count": 5, + "device_nvme1_type_nvme_power_on_time": 17038800, + "device_nvme1_type_nvme_smart_status_failed": 0, + "device_nvme1_type_nvme_smart_status_passed": 1, + "device_nvme1_type_nvme_temperature": 36, + }, + }, "success type scsi devices": { prepareMock: prepareMockOkTypeScsi, wantCharts: 7, @@ -326,6 +352,9 @@ func TestSmartctl_Collect(t *testing.T) { for name, test := range tests { t.Run(name, func(t *testing.T) { smart := New() + if test.prepareConfig != nil { + smart.Config = test.prepareConfig() + } mock := test.prepareMock() smart.exec = mock smart.ScanEvery = web.Duration(time.Microsecond * 1) @@ -390,6 +419,8 @@ func prepareMockOkTypeNvme() *mockSmartctlCliExec { switch deviceName { case "/dev/nvme0": return dataTypeNvmeDeviceNvme0, nil + case "/dev/nvme1": + return dataTypeNvmeDeviceNvme1, nil default: return nil, fmt.Errorf("unexpected device name %s", deviceName) } diff --git a/src/go/plugin/go.d/modules/smartctl/testdata/config.json b/src/go/plugin/go.d/modules/smartctl/testdata/config.json index ed26105eef2075..41c69da51952dd 100644 --- a/src/go/plugin/go.d/modules/smartctl/testdata/config.json +++ b/src/go/plugin/go.d/modules/smartctl/testdata/config.json @@ -4,5 +4,11 @@ "scan_every": 123.123, "poll_devices_every": 123.123, "no_check_power_mode": "ok", - "device_selector": "ok" + "device_selector": "ok", + "extra_devices": [ + { + "name": "ok", + "type": "ok" + } + ] } diff --git a/src/go/plugin/go.d/modules/smartctl/testdata/config.yaml b/src/go/plugin/go.d/modules/smartctl/testdata/config.yaml index 94c9b0acd581df..b0b77d53d77b06 100644 --- a/src/go/plugin/go.d/modules/smartctl/testdata/config.yaml +++ b/src/go/plugin/go.d/modules/smartctl/testdata/config.yaml @@ -3,4 +3,7 @@ timeout: 123.123 scan_every: 123.123 poll_devices_every: 123.123 no_check_power_mode: "ok" -"device_selector": "ok" +device_selector: "ok" +extra_devices: + - name: "ok" + type: "ok" diff --git a/src/go/plugin/go.d/modules/smartctl/testdata/type-nvme/device-nvme1.json b/src/go/plugin/go.d/modules/smartctl/testdata/type-nvme/device-nvme1.json new file mode 100644 index 00000000000000..37faf7cfee3dea --- /dev/null +++ b/src/go/plugin/go.d/modules/smartctl/testdata/type-nvme/device-nvme1.json @@ -0,0 +1,113 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 3 + ], + "svn_revision": "5338", + 
"platform_info": "REDACTED", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--json", + "--all", + "/dev/nvme1", + "--device", + "nvme" + ], + "exit_status": 0 + }, + "local_time": { + "time_t": 1720897758, + "asctime": "Sat Jul 13 22:09:18 2024 EEST" + }, + "device": { + "name": "/dev/nvme1", + "info_name": "/dev/nvme1", + "type": "nvme", + "protocol": "NVMe" + }, + "model_name": "Seagate FireCuda 530 ZP4000GM30023", + "serial_number": "REDACTED", + "firmware_version": "REDACTED", + "nvme_pci_vendor": { + "id": 7089, + "subsystem_id": 7089 + }, + "nvme_ieee_oui_identifier": 6584743, + "nvme_total_capacity": 4000787030016, + "nvme_unallocated_capacity": 0, + "nvme_controller_id": 1, + "nvme_version": { + "string": "1.4", + "value": 66560 + }, + "nvme_number_of_namespaces": 1, + "nvme_namespaces": [ + { + "id": 1, + "size": { + "blocks": 7814037168, + "bytes": 4000787030016 + }, + "capacity": { + "blocks": 7814037168, + "bytes": 4000787030016 + }, + "utilization": { + "blocks": 7814037168, + "bytes": 4000787030016 + }, + "formatted_lba_size": 512, + "eui64": { + "oui": 6584743, + "ext_id": 553497146765 + } + } + ], + "user_capacity": { + "blocks": 7814037168, + "bytes": 4000787030016 + }, + "logical_block_size": 512, + "smart_support": { + "available": true, + "enabled": true + }, + "smart_status": { + "passed": true, + "nvme": { + "value": 0 + } + }, + "nvme_smart_health_information_log": { + "critical_warning": 0, + "temperature": 36, + "available_spare": 100, + "available_spare_threshold": 5, + "percentage_used": 0, + "data_units_read": 202, + "data_units_written": 0, + "host_reads": 2509, + "host_writes": 0, + "controller_busy_time": 0, + "power_cycles": 5, + "power_on_hours": 4733, + "unsafe_shutdowns": 2, + "media_errors": 0, + "num_err_log_entries": 20, + "warning_temp_time": 0, + "critical_comp_time": 0 + }, + "temperature": { + "current": 36 + }, + "power_cycle_count": 5, + "power_on_time": { + "hours": 4733 + } +} From 7150f99481b28c0f9fede07fc33f43d138eb2628 Mon Sep 17 00:00:00 2001 From: Netdata bot <43409846+netdatabot@users.noreply.github.com> Date: Sat, 13 Jul 2024 16:25:27 -0400 Subject: [PATCH 12/18] Regenerate integrations.js (#18141) Co-authored-by: ilyam8 <22274335+ilyam8@users.noreply.github.com> --- integrations/integrations.js | 2 +- integrations/integrations.json | 2 +- .../smartctl/integrations/s.m.a.r.t..md | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/integrations/integrations.js b/integrations/integrations.js index 85ca3b0207f95e..1ae51ee44fa91d 100644 --- a/integrations/integrations.js +++ b/integrations/integrations.js @@ -16102,7 +16102,7 @@ export const integrations = [ "most_popular": false }, "overview": "# S.M.A.R.T.\n\nPlugin: go.d.plugin\nModule: smartctl\n\n## Overview\n\nThis collector monitors the health status of storage devices by analyzing S.M.A.R.T. 
(Self-Monitoring, Analysis, and Reporting Technology) counters.\nIt relies on the [`smartctl`](https://linux.die.net/man/8/smartctl) CLI tool but avoids directly executing the binary.\nInstead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment.\nThis approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management.\n\nExecuted commands:\n- `smartctl --json --scan`\n- `smartctl --json --all {deviceName} --device {deviceType} --nocheck {powerMode}`\n\n\n\n\nThis collector is supported on all platforms.\n\nThis collector only supports collecting metrics from a single instance of this integration.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nThis integration doesn't support auto-detection.\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", - "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n{% details open=true summary=\"Config options\" %}\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. | 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. 
| 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n{% /details %}\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n{% /details %}\n", + "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n{% details open=true summary=\"Config options\" %}\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. 
| 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. | 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no |\n| extra_devices | Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type. See \"Configuration Examples\" for details. | [] | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n{% /details %}\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n{% /details %}\n##### Extra devices\n\nThis example demonstrates using `extra_devices` to manually add a storage device (`/dev/sdc`) not automatically detected by `smartctl --scan`.\n\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: smartctl\n extra_devices:\n - name: /dev/sdc\n type: jmb39x-q,3\n\n```\n{% /details %}\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `smartctl` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m smartctl\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `smartctl` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. 
These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep smartctl\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep smartctl /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep smartctl\n```\n\n", "alerts": "## Alerts\n\nThere are no alerts configured by default for this integration.\n", "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per controller\n\nThese metrics refer to the Storage Device.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| device_name | Device name |\n| device_type | Device type |\n| model_name | Model name |\n| serial_number | Serial number |\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| smartctl.device_smart_status | passed, failed | status |\n| smartctl.device_ata_smart_error_log_count | error_log | logs |\n| smartctl.device_power_on_time | power_on_time | seconds |\n| smartctl.device_temperature | temperature | Celsius |\n| smartctl.device_power_cycles_count | power | cycles |\n| smartctl.device_read_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_write_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_verify_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_smart_attr_{attribute_name} | {attribute_name} | {attribute_unit} |\n| smartctl.device_smart_attr_{attribute_name}_normalized | {attribute_name} | value |\n\n", diff --git a/integrations/integrations.json b/integrations/integrations.json index 1272e22e927a9c..0ab1a068453469 100644 --- a/integrations/integrations.json +++ b/integrations/integrations.json @@ -16100,7 +16100,7 @@ "most_popular": false }, "overview": "# S.M.A.R.T.\n\nPlugin: go.d.plugin\nModule: smartctl\n\n## Overview\n\nThis collector monitors the health status of storage devices by analyzing S.M.A.R.T. 
(Self-Monitoring, Analysis, and Reporting Technology) counters.\nIt relies on the [`smartctl`](https://linux.die.net/man/8/smartctl) CLI tool but avoids directly executing the binary.\nInstead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment.\nThis approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management.\n\nExecuted commands:\n- `smartctl --json --scan`\n- `smartctl --json --all {deviceName} --device {deviceType} --nocheck {powerMode}`\n\n\n\n\nThis collector is supported on all platforms.\n\nThis collector only supports collecting metrics from a single instance of this integration.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nThis integration doesn't support auto-detection.\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", - "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. | 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. | 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. 
| * | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n", + "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. | 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. 
| 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no |\n| extra_devices | Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type. See \"Configuration Examples\" for details. | [] | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n##### Extra devices\n\nThis example demonstrates using `extra_devices` to manually add a storage device (`/dev/sdc`) not automatically detected by `smartctl --scan`.\n\n\n```yaml\njobs:\n - name: smartctl\n extra_devices:\n - name: /dev/sdc\n type: jmb39x-q,3\n\n```\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `smartctl` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m smartctl\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `smartctl` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep smartctl\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep smartctl /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. 
Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep smartctl\n```\n\n", "alerts": "## Alerts\n\nThere are no alerts configured by default for this integration.\n", "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per controller\n\nThese metrics refer to the Storage Device.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| device_name | Device name |\n| device_type | Device type |\n| model_name | Model name |\n| serial_number | Serial number |\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| smartctl.device_smart_status | passed, failed | status |\n| smartctl.device_ata_smart_error_log_count | error_log | logs |\n| smartctl.device_power_on_time | power_on_time | seconds |\n| smartctl.device_temperature | temperature | Celsius |\n| smartctl.device_power_cycles_count | power | cycles |\n| smartctl.device_read_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_write_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_verify_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_smart_attr_{attribute_name} | {attribute_name} | {attribute_unit} |\n| smartctl.device_smart_attr_{attribute_name}_normalized | {attribute_name} | value |\n\n", diff --git a/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md b/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md index 1b893b488a2750..45aa9526802eb5 100644 --- a/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md +++ b/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md @@ -162,6 +162,7 @@ The following options can be defined globally: update_every. | scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no | | poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. | 300 | no | | device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no | +| extra_devices | Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type. See "Configuration Examples" for details. | [] | no | | no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no | ##### no_check_power_mode @@ -194,6 +195,23 @@ jobs: ``` +##### Extra devices + +This example demonstrates using `extra_devices` to manually add a storage device (`/dev/sdc`) not automatically detected by `smartctl --scan`. + + +
Config + +```yaml +jobs: + - name: smartctl + extra_devices: + - name: /dev/sdc + type: jmb39x-q,3 + +``` +
+ ## Troubleshooting From e854c04a2ded7437dac6ed6d3d8dbb93b4dc1a05 Mon Sep 17 00:00:00 2001 From: netdatabot Date: Sun, 14 Jul 2024 00:18:51 +0000 Subject: [PATCH 13/18] [ci skip] Update changelog and version for nightly build: v1.46.0-152-nightly. --- CHANGELOG.md | 4 ++-- packaging/version | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b496425372a03..a451c9299fac4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ **Merged pull requests:** +- Regenerate integrations.js [\#18141](https://github.com/netdata/netdata/pull/18141) ([netdatabot](https://github.com/netdatabot)) +- go.d smartctl add "extra\_devices" option [\#18140](https://github.com/netdata/netdata/pull/18140) ([ilyam8](https://github.com/ilyam8)) - Spawn server fixes 6 [\#18136](https://github.com/netdata/netdata/pull/18136) ([ktsaou](https://github.com/ktsaou)) - Regenerate integrations.js [\#18135](https://github.com/netdata/netdata/pull/18135) ([netdatabot](https://github.com/netdatabot)) - docs: go.d mysql: remove unix sockets from auto\_detection [\#18134](https://github.com/netdata/netdata/pull/18134) ([ilyam8](https://github.com/ilyam8)) @@ -410,8 +412,6 @@ - Fix invalid item in postinst script for Netdata package. [\#17580](https://github.com/netdata/netdata/pull/17580) ([Ferroin](https://github.com/Ferroin)) - Regenerate integrations.js [\#17578](https://github.com/netdata/netdata/pull/17578) ([netdatabot](https://github.com/netdatabot)) - Cpack fixes [\#17576](https://github.com/netdata/netdata/pull/17576) ([vkalintiris](https://github.com/vkalintiris)) -- Fix compilation without `dbengine` [\#17575](https://github.com/netdata/netdata/pull/17575) ([thiagoftsm](https://github.com/thiagoftsm)) -- Fix handling of netdata.conf on install in build system. 
[\#17572](https://github.com/netdata/netdata/pull/17572) ([Ferroin](https://github.com/Ferroin)) ## [v1.45.6](https://github.com/netdata/netdata/tree/v1.45.6) (2024-06-05) diff --git a/packaging/version b/packaging/version index f0f7e39dd06c0a..c31ee995c07ab9 100644 --- a/packaging/version +++ b/packaging/version @@ -1 +1 @@ -v1.46.0-149-nightly +v1.46.0-152-nightly From fa4d1509d8f0efe7d85ec5f5f4e05e5e66fe208a Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Sun, 14 Jul 2024 22:01:28 +0300 Subject: [PATCH 14/18] ndsudo add smartctl scan-open (#18143) --- src/collectors/plugins.d/ndsudo.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/collectors/plugins.d/ndsudo.c b/src/collectors/plugins.d/ndsudo.c index cc8619bffda583..29cf52e4b21047 100644 --- a/src/collectors/plugins.d/ndsudo.c +++ b/src/collectors/plugins.d/ndsudo.c @@ -45,6 +45,14 @@ struct command { [1] = NULL, }, }, + { + .name = "smartctl-json-scan-open", + .params = "--json --scan-open", + .search = { + [0] = "smartctl", + [1] = NULL, + }, + }, { .name = "smartctl-json-device-info", .params = "--json --all {{deviceName}} --device {{deviceType}} --nocheck {{powerMode}}", From 7cb342c77ead000339e7d9284d61f65bc41ecc48 Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Sun, 14 Jul 2024 22:02:34 +0300 Subject: [PATCH 15/18] go.d smartctl: do scan only once on startup if interval is 0 (#18144) --- src/go/plugin/go.d/modules/smartctl/collect.go | 2 +- .../go.d/modules/smartctl/config_schema.json | 2 +- src/go/plugin/go.d/modules/smartctl/metadata.yaml | 2 +- src/go/plugin/go.d/modules/smartctl/scan.go | 14 +++++++++----- src/go/plugin/go.d/modules/smartctl/smartctl.go | 1 + 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/go/plugin/go.d/modules/smartctl/collect.go b/src/go/plugin/go.d/modules/smartctl/collect.go index e8f8357d680c56..35585db62f3ae9 100644 --- a/src/go/plugin/go.d/modules/smartctl/collect.go +++ b/src/go/plugin/go.d/modules/smartctl/collect.go @@ -164,7 +164,7 @@ func (s *Smartctl) collectSmartDevice(mx map[string]int64, dev *smartDevice) { } func (s *Smartctl) isTimeToScan(now time.Time) bool { - return now.After(s.lastScanTime.Add(s.ScanEvery.Duration())) + return s.ScanEvery.Duration().Seconds() != 0 && now.After(s.lastScanTime.Add(s.ScanEvery.Duration())) } func (s *Smartctl) isTimeToPollDevices(now time.Time) bool { diff --git a/src/go/plugin/go.d/modules/smartctl/config_schema.json b/src/go/plugin/go.d/modules/smartctl/config_schema.json index e03f8081da0950..afe7ce1a9fc788 100644 --- a/src/go/plugin/go.d/modules/smartctl/config_schema.json +++ b/src/go/plugin/go.d/modules/smartctl/config_schema.json @@ -20,7 +20,7 @@ }, "scan_every": { "title": "Scan interval", - "description": "Interval for discovering new devices using `smartctl --scan`, measured in seconds.", + "description": "Interval for discovering new devices using `smartctl --scan`, measured in seconds. Set to 0 to scan devices only once on startup.", "type": "number", "minimum": 1, "default": 900 diff --git a/src/go/plugin/go.d/modules/smartctl/metadata.yaml b/src/go/plugin/go.d/modules/smartctl/metadata.yaml index 0b54f69fbecf90..9293c25419cdf0 100644 --- a/src/go/plugin/go.d/modules/smartctl/metadata.yaml +++ b/src/go/plugin/go.d/modules/smartctl/metadata.yaml @@ -97,7 +97,7 @@ modules: default_value: 5 required: false - name: scan_every - description: interval for discovering new devices using `smartctl --scan`, measured in seconds. 
+ description: interval for discovering new devices using `smartctl --scan`, measured in seconds. Set to 0 to scan devices only once on startup. default_value: 900 required: false - name: poll_devices_every diff --git a/src/go/plugin/go.d/modules/smartctl/scan.go b/src/go/plugin/go.d/modules/smartctl/scan.go index e4291be4f91889..06c8cdcb7fa6a0 100644 --- a/src/go/plugin/go.d/modules/smartctl/scan.go +++ b/src/go/plugin/go.d/modules/smartctl/scan.go @@ -53,11 +53,15 @@ func (s *Smartctl) scanDevices() (map[string]*scanDevice, error) { // Accurate device type information is crucial because we use the `--device` option to gather data. // Using the wrong type can lead to issues. // For example, using 'scsi' for 'sat' devices prevents `smartctl` from issuing the necessary ATA commands. - resp, _ := s.exec.deviceInfo(dev.name, dev.typ, s.NoCheckPowerMode) - if resp != nil && isExitStatusHasBit(resp, 2) { - correctType := "sat" - s.Debugf("changing device '%s' type '%s' -> '%s'", dev.name, dev.typ, correctType) - dev.typ = correctType + d := scanDevice{name: dev.name, typ: "sat"} + if _, ok := s.scannedDevices[d.key()]; ok { + dev.typ = "sat" + } else { + resp, _ := s.exec.deviceInfo(dev.name, dev.typ, s.NoCheckPowerMode) + if resp != nil && isExitStatusHasBit(resp, 2) { + s.Debugf("changing device '%s' type 'scsi' -> 'sat'", dev.name) + dev.typ = "sat" + } } } diff --git a/src/go/plugin/go.d/modules/smartctl/smartctl.go b/src/go/plugin/go.d/modules/smartctl/smartctl.go index 1ea1a8fbaf8cc0..0ba0491794f649 100644 --- a/src/go/plugin/go.d/modules/smartctl/smartctl.go +++ b/src/go/plugin/go.d/modules/smartctl/smartctl.go @@ -38,6 +38,7 @@ func New() *Smartctl { DeviceSelector: "*", }, charts: &module.Charts{}, + forceScan: true, deviceSr: matcher.TRUE(), seenDevices: make(map[string]bool), } From cb126dcdd3ca12023e15f727f8f312de3e096e9a Mon Sep 17 00:00:00 2001 From: Netdata bot <43409846+netdatabot@users.noreply.github.com> Date: Sun, 14 Jul 2024 15:22:27 -0400 Subject: [PATCH 16/18] Regenerate integrations.js (#18145) Co-authored-by: ilyam8 <22274335+ilyam8@users.noreply.github.com> --- integrations/integrations.js | 2 +- integrations/integrations.json | 2 +- src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/integrations.js b/integrations/integrations.js index 1ae51ee44fa91d..27af1cdc3bd288 100644 --- a/integrations/integrations.js +++ b/integrations/integrations.js @@ -16102,7 +16102,7 @@ export const integrations = [ "most_popular": false }, "overview": "# S.M.A.R.T.\n\nPlugin: go.d.plugin\nModule: smartctl\n\n## Overview\n\nThis collector monitors the health status of storage devices by analyzing S.M.A.R.T. 
(Self-Monitoring, Analysis, and Reporting Technology) counters.\nIt relies on the [`smartctl`](https://linux.die.net/man/8/smartctl) CLI tool but avoids directly executing the binary.\nInstead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment.\nThis approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management.\n\nExecuted commands:\n- `smartctl --json --scan`\n- `smartctl --json --all {deviceName} --device {deviceType} --nocheck {powerMode}`\n\n\n\n\nThis collector is supported on all platforms.\n\nThis collector only supports collecting metrics from a single instance of this integration.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nThis integration doesn't support auto-detection.\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", - "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n{% details open=true summary=\"Config options\" %}\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. | 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. 
| 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no |\n| extra_devices | Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type. See \"Configuration Examples\" for details. | [] | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n{% /details %}\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n{% /details %}\n##### Extra devices\n\nThis example demonstrates using `extra_devices` to manually add a storage device (`/dev/sdc`) not automatically detected by `smartctl --scan`.\n\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: smartctl\n extra_devices:\n - name: /dev/sdc\n type: jmb39x-q,3\n\n```\n{% /details %}\n", + "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. 
Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n{% details open=true summary=\"Config options\" %}\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. | 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. Set to 0 to scan devices only once on startup. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. | 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no |\n| extra_devices | Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type. See \"Configuration Examples\" for details. | [] | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n{% /details %}\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n{% /details %}\n##### Extra devices\n\nThis example demonstrates using `extra_devices` to manually add a storage device (`/dev/sdc`) not automatically detected by `smartctl --scan`.\n\n\n{% details open=true summary=\"Config\" %}\n```yaml\njobs:\n - name: smartctl\n extra_devices:\n - name: /dev/sdc\n type: jmb39x-q,3\n\n```\n{% /details %}\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `smartctl` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. 
If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m smartctl\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `smartctl` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep smartctl\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep smartctl /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep smartctl\n```\n\n", "alerts": "## Alerts\n\nThere are no alerts configured by default for this integration.\n", "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per controller\n\nThese metrics refer to the Storage Device.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| device_name | Device name |\n| device_type | Device type |\n| model_name | Model name |\n| serial_number | Serial number |\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| smartctl.device_smart_status | passed, failed | status |\n| smartctl.device_ata_smart_error_log_count | error_log | logs |\n| smartctl.device_power_on_time | power_on_time | seconds |\n| smartctl.device_temperature | temperature | Celsius |\n| smartctl.device_power_cycles_count | power | cycles |\n| smartctl.device_read_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_write_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_verify_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_smart_attr_{attribute_name} | {attribute_name} | {attribute_unit} |\n| smartctl.device_smart_attr_{attribute_name}_normalized | {attribute_name} | value |\n\n", diff --git a/integrations/integrations.json b/integrations/integrations.json index 0ab1a068453469..2c9178afca259e 100644 --- a/integrations/integrations.json +++ b/integrations/integrations.json @@ -16100,7 +16100,7 @@ "most_popular": false }, "overview": "# S.M.A.R.T.\n\nPlugin: go.d.plugin\nModule: smartctl\n\n## Overview\n\nThis collector monitors the health status of storage devices by analyzing S.M.A.R.T. 
(Self-Monitoring, Analysis, and Reporting Technology) counters.\nIt relies on the [`smartctl`](https://linux.die.net/man/8/smartctl) CLI tool but avoids directly executing the binary.\nInstead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment.\nThis approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management.\n\nExecuted commands:\n- `smartctl --json --scan`\n- `smartctl --json --all {deviceName} --device {deviceType} --nocheck {powerMode}`\n\n\n\n\nThis collector is supported on all platforms.\n\nThis collector only supports collecting metrics from a single instance of this integration.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nThis integration doesn't support auto-detection.\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", - "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. | 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. | 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. 
| * | no |\n| extra_devices | Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type. See \"Configuration Examples\" for details. | [] | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n##### Extra devices\n\nThis example demonstrates using `extra_devices` to manually add a storage device (`/dev/sdc`) not automatically detected by `smartctl --scan`.\n\n\n```yaml\njobs:\n - name: smartctl\n extra_devices:\n - name: /dev/sdc\n type: jmb39x-q,3\n\n```\n", + "setup": "## Setup\n\n### Prerequisites\n\n#### Install smartmontools (v7.0+)\n\nInstall `smartmontools` version 7.0 or later using your distribution's package manager. Version 7.0 introduced the `--json` output mode, which is required for this collector to function properly.\n\n\n#### For Netdata running in a Docker container\n\nNetdata requires the `SYS_RAWIO` capability and access to the storage devices to run the `smartctl` collector inside a Docker container. Here's how you can achieve this:\n\n- `docker run`\n\n ```bash\n docker run --cap-add SYS_RAWIO --device /dev/sda:/dev/sda ...\n ```\n\n- `docker-compose.yml`\n\n ```yaml\n services:\n netdata:\n cap_add:\n - SYS_PTRACE\n - SYS_ADMIN\n - SYS_RAWIO # smartctl\n devices:\n - \"/dev/sda:/dev/sda\"\n ```\n\n> **Multiple Devices**: These examples only show mapping of one device (/dev/sda). You'll need to add additional `--device` options (in docker run) or entries in the `devices` list (in docker-compose.yml) for each storage device you want Netdata's smartctl collector to monitor.\n\n> **NVMe Devices**: Do not map NVMe devices using this method. 
Netdata uses a [dedicated collector](https://github.com/netdata/netdata/tree/master/src/go/plugin/go.d/modules/nvme#readme) to monitor NVMe devices.\n\n\n\n### Configuration\n\n#### File\n\nThe configuration file name for this integration is `go.d/smartctl.conf`.\n\n\nYou can edit the configuration file using the `edit-config` script from the\nNetdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).\n\n```bash\ncd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata\nsudo ./edit-config go.d/smartctl.conf\n```\n#### Options\n\nThe following options can be defined globally: update_every.\n\n\n| Name | Description | Default | Required |\n|:----|:-----------|:-------|:--------:|\n| update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no |\n| timeout | smartctl binary execution timeout. | 5 | no |\n| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. Set to 0 to scan devices only once on startup. | 900 | no |\n| poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. | 300 | no |\n| device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no |\n| extra_devices | Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type. See \"Configuration Examples\" for details. | [] | no |\n| no_check_power_mode | Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up. | standby | no |\n\n##### no_check_power_mode\n\nThe valid arguments to this option are:\n\n| Mode | Description |\n|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| never | Check the device always. |\n| sleep | Check the device unless it is in SLEEP mode. |\n| standby | Check the device unless it is in SLEEP or STANDBY mode. In these modes most disks are not spinning, so if you want to prevent a disk from spinning up, this is probably what you want. |\n| idle | Check the device unless it is in SLEEP, STANDBY or IDLE mode. In the IDLE state, most disks are still spinning, so this is probably not what you want. |\n\n\n#### Examples\n\n##### Custom devices poll interval\n\nAllows you to override the default devices poll interval (data collection).\n\n```yaml\njobs:\n - name: smartctl\n devices_poll_interval: 60 # Collect S.M.A.R.T statistics every 60 seconds\n\n```\n##### Extra devices\n\nThis example demonstrates using `extra_devices` to manually add a storage device (`/dev/sdc`) not automatically detected by `smartctl --scan`.\n\n\n```yaml\njobs:\n - name: smartctl\n extra_devices:\n - name: /dev/sdc\n type: jmb39x-q,3\n\n```\n", "troubleshooting": "## Troubleshooting\n\n### Debug Mode\n\nTo troubleshoot issues with the `smartctl` collector, run the `go.d.plugin` with the debug option enabled. The output\nshould give you clues as to why the collector isn't working.\n\n- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. 
If that's not the case on\n your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.\n\n ```bash\n cd /usr/libexec/netdata/plugins.d/\n ```\n\n- Switch to the `netdata` user.\n\n ```bash\n sudo -u netdata -s\n ```\n\n- Run the `go.d.plugin` to debug the collector:\n\n ```bash\n ./go.d.plugin -d -m smartctl\n ```\n\n### Getting Logs\n\nIf you're encountering problems with the `smartctl` collector, follow these steps to retrieve logs and identify potential issues:\n\n- **Run the command** specific to your system (systemd, non-systemd, or Docker container).\n- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem.\n\n#### System with systemd\n\nUse the following command to view logs generated since the last Netdata service restart:\n\n```bash\njournalctl _SYSTEMD_INVOCATION_ID=\"$(systemctl show --value --property=InvocationID netdata)\" --namespace=netdata --grep smartctl\n```\n\n#### System without systemd\n\nLocate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name:\n\n```bash\ngrep smartctl /var/log/netdata/collector.log\n```\n\n**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues.\n\n#### Docker Container\n\nIf your Netdata runs in a Docker container named \"netdata\" (replace if different), use this command:\n\n```bash\ndocker logs netdata 2>&1 | grep smartctl\n```\n\n", "alerts": "## Alerts\n\nThere are no alerts configured by default for this integration.\n", "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per controller\n\nThese metrics refer to the Storage Device.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| device_name | Device name |\n| device_type | Device type |\n| model_name | Model name |\n| serial_number | Serial number |\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| smartctl.device_smart_status | passed, failed | status |\n| smartctl.device_ata_smart_error_log_count | error_log | logs |\n| smartctl.device_power_on_time | power_on_time | seconds |\n| smartctl.device_temperature | temperature | Celsius |\n| smartctl.device_power_cycles_count | power | cycles |\n| smartctl.device_read_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_write_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_verify_errors_rate | corrected, uncorrected | errors/s |\n| smartctl.device_smart_attr_{attribute_name} | {attribute_name} | {attribute_unit} |\n| smartctl.device_smart_attr_{attribute_name}_normalized | {attribute_name} | value |\n\n", diff --git a/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md b/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md index 45aa9526802eb5..b3037c4b5b5fa1 100644 --- a/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md +++ b/src/go/plugin/go.d/modules/smartctl/integrations/s.m.a.r.t..md @@ -159,7 +159,7 @@ The following options can be defined globally: update_every. |:----|:-----------|:-------|:--------:| | update_every | interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. | 10 | no | | timeout | smartctl binary execution timeout. 
| 5 | no | -| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. | 900 | no | +| scan_every | interval for discovering new devices using `smartctl --scan`, measured in seconds. Set to 0 to scan devices only once on startup. | 900 | no | | poll_devices_every | interval for gathering data for every device, measured in seconds. Data is cached for this interval. | 300 | no | | device_selector | Specifies a pattern to match the 'info name' of devices as reported by `smartctl --scan --json`. | * | no | | extra_devices | Allows manual specification of devices not automatically detected by `smartctl --scan`. Each device entry must include both a name and a type. See "Configuration Examples" for details. | [] | no | From 7aeb251f5486706c1f553da4938cb66cc93268eb Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Sun, 14 Jul 2024 23:01:07 +0300 Subject: [PATCH 17/18] go.d smartctl: use scan-open when "no_check_power_mode" is "never" (#18146) --- src/go/plugin/go.d/modules/smartctl/exec.go | 4 ++ src/go/plugin/go.d/modules/smartctl/scan.go | 47 +++++++++++++------ .../plugin/go.d/modules/smartctl/smartctl.go | 1 + .../go.d/modules/smartctl/smartctl_test.go | 4 ++ 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/src/go/plugin/go.d/modules/smartctl/exec.go b/src/go/plugin/go.d/modules/smartctl/exec.go index 48bd11d80ef1f3..289501f7111081 100644 --- a/src/go/plugin/go.d/modules/smartctl/exec.go +++ b/src/go/plugin/go.d/modules/smartctl/exec.go @@ -33,6 +33,10 @@ func (e *smartctlCliExec) scan() (*gjson.Result, error) { return e.execute("smartctl-json-scan") } +func (e *smartctlCliExec) scanOpen() (*gjson.Result, error) { + return e.execute("smartctl-json-scan-open") +} + func (e *smartctlCliExec) deviceInfo(deviceName, deviceType, powerMode string) (*gjson.Result, error) { return e.execute("smartctl-json-device-info", "--deviceName", deviceName, diff --git a/src/go/plugin/go.d/modules/smartctl/scan.go b/src/go/plugin/go.d/modules/smartctl/scan.go index 06c8cdcb7fa6a0..58af0b0cad4d15 100644 --- a/src/go/plugin/go.d/modules/smartctl/scan.go +++ b/src/go/plugin/go.d/modules/smartctl/scan.go @@ -6,6 +6,8 @@ import ( "errors" "fmt" "strings" + + "github.com/tidwall/gjson" ) type scanDevice struct { @@ -24,7 +26,22 @@ func (s *scanDevice) shortName() string { } func (s *Smartctl) scanDevices() (map[string]*scanDevice, error) { - resp, err := s.exec.scan() + powerModeNever := s.NoCheckPowerMode == "never" + + var resp *gjson.Result + var err error + + // Issue on Discord: https://discord.com/channels/847502280503590932/1261747175361347644/1261747175361347644 + // "sat" devices being identified as "scsi" with --scan, and then later + // code attempts to validate the type by calling `smartctl` with the "scsi" type. + // This validation can trigger unintended "Enabling discard_zeroes_data" messages in system logs (dmesg). + // To address this specific issue we use `smartctl --scan-open` as a workaround. + // This method reliably identifies device types. 
+ if powerModeNever { + resp, err = s.exec.scanOpen() + } else { + resp, err = s.exec.scan() + } if err != nil { return nil, fmt.Errorf("failed to scan devices: %v", err) } @@ -35,7 +52,7 @@ func (s *Smartctl) scanDevices() (map[string]*scanDevice, error) { dev := &scanDevice{ name: d.Get("name").String(), infoName: d.Get("info_name").String(), - typ: d.Get("type").String(), // guessed type (we do '--scan' not '--scan-open') + typ: d.Get("type").String(), // guessed type when using '--scan' instead of '--scan-open' } if dev.name == "" || dev.typ == "" { @@ -48,19 +65,21 @@ func (s *Smartctl) scanDevices() (map[string]*scanDevice, error) { continue } - if dev.typ == "scsi" { - // `smartctl --scan` attempts to guess the device type based on the path, but this can be unreliable. - // Accurate device type information is crucial because we use the `--device` option to gather data. - // Using the wrong type can lead to issues. - // For example, using 'scsi' for 'sat' devices prevents `smartctl` from issuing the necessary ATA commands. - d := scanDevice{name: dev.name, typ: "sat"} - if _, ok := s.scannedDevices[d.key()]; ok { - dev.typ = "sat" - } else { - resp, _ := s.exec.deviceInfo(dev.name, dev.typ, s.NoCheckPowerMode) - if resp != nil && isExitStatusHasBit(resp, 2) { - s.Debugf("changing device '%s' type 'scsi' -> 'sat'", dev.name) + if !powerModeNever { + if dev.typ == "scsi" { + // `smartctl --scan` attempts to guess the device type based on the path, but this can be unreliable. + // Accurate device type information is crucial because we use the `--device` option to gather data. + // Using the wrong type can lead to issues. + // For example, using 'scsi' for 'sat' devices prevents `smartctl` from issuing the necessary ATA commands. + d := scanDevice{name: dev.name, typ: "sat"} + if _, ok := s.scannedDevices[d.key()]; ok { dev.typ = "sat" + } else { + resp, _ := s.exec.deviceInfo(dev.name, dev.typ, s.NoCheckPowerMode) + if resp != nil && isExitStatusHasBit(resp, 2) { + s.Debugf("changing device '%s' type 'scsi' -> 'sat'", dev.name) + dev.typ = "sat" + } } } } diff --git a/src/go/plugin/go.d/modules/smartctl/smartctl.go b/src/go/plugin/go.d/modules/smartctl/smartctl.go index 0ba0491794f649..777336d95bb38f 100644 --- a/src/go/plugin/go.d/modules/smartctl/smartctl.go +++ b/src/go/plugin/go.d/modules/smartctl/smartctl.go @@ -83,6 +83,7 @@ type ( } smartctlCli interface { scan() (*gjson.Result, error) + scanOpen() (*gjson.Result, error) deviceInfo(deviceName, deviceType, powerMode string) (*gjson.Result, error) } ) diff --git a/src/go/plugin/go.d/modules/smartctl/smartctl_test.go b/src/go/plugin/go.d/modules/smartctl/smartctl_test.go index 2f8d4d66a13229..f66c3739a0131d 100644 --- a/src/go/plugin/go.d/modules/smartctl/smartctl_test.go +++ b/src/go/plugin/go.d/modules/smartctl/smartctl_test.go @@ -477,6 +477,10 @@ func (m *mockSmartctlCliExec) scan() (*gjson.Result, error) { return &res, nil } +func (m *mockSmartctlCliExec) scanOpen() (*gjson.Result, error) { + return m.scan() +} + func (m *mockSmartctlCliExec) deviceInfo(deviceName, deviceType, powerMode string) (*gjson.Result, error) { if m.deviceDataFunc == nil { return nil, nil From 3d189958a2eb0e1e40cdb7f39395781e98101f72 Mon Sep 17 00:00:00 2001 From: netdatabot Date: Mon, 15 Jul 2024 00:18:09 +0000 Subject: [PATCH 18/18] [ci skip] Update changelog and version for nightly build: v1.46.0-157-nightly. 
--- CHANGELOG.md | 7 ++++--- packaging/version | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a451c9299fac4d..8a56e3d71d0743 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ **Merged pull requests:** +- go.d smartctl: use scan-open when "no\_check\_power\_mode" is "never" [\#18146](https://github.com/netdata/netdata/pull/18146) ([ilyam8](https://github.com/ilyam8)) +- Regenerate integrations.js [\#18145](https://github.com/netdata/netdata/pull/18145) ([netdatabot](https://github.com/netdatabot)) +- go.d smartctl: do scan only once on startup if interval is 0 [\#18144](https://github.com/netdata/netdata/pull/18144) ([ilyam8](https://github.com/ilyam8)) +- ndsudo add smartctl scan-open [\#18143](https://github.com/netdata/netdata/pull/18143) ([ilyam8](https://github.com/ilyam8)) - Regenerate integrations.js [\#18141](https://github.com/netdata/netdata/pull/18141) ([netdatabot](https://github.com/netdatabot)) - go.d smartctl add "extra\_devices" option [\#18140](https://github.com/netdata/netdata/pull/18140) ([ilyam8](https://github.com/ilyam8)) - Spawn server fixes 6 [\#18136](https://github.com/netdata/netdata/pull/18136) ([ktsaou](https://github.com/ktsaou)) @@ -409,9 +413,6 @@ - Fix DEB package conflict entries. [\#17584](https://github.com/netdata/netdata/pull/17584) ([Ferroin](https://github.com/Ferroin)) - fix ndsudo setuid bit for static builds [\#17583](https://github.com/netdata/netdata/pull/17583) ([ilyam8](https://github.com/ilyam8)) - fix table [\#17581](https://github.com/netdata/netdata/pull/17581) ([hugovalente-pm](https://github.com/hugovalente-pm)) -- Fix invalid item in postinst script for Netdata package. [\#17580](https://github.com/netdata/netdata/pull/17580) ([Ferroin](https://github.com/Ferroin)) -- Regenerate integrations.js [\#17578](https://github.com/netdata/netdata/pull/17578) ([netdatabot](https://github.com/netdatabot)) -- Cpack fixes [\#17576](https://github.com/netdata/netdata/pull/17576) ([vkalintiris](https://github.com/vkalintiris)) ## [v1.45.6](https://github.com/netdata/netdata/tree/v1.45.6) (2024-06-05) diff --git a/packaging/version b/packaging/version index c31ee995c07ab9..8a7e020f190cf4 100644 --- a/packaging/version +++ b/packaging/version @@ -1 +1 @@ -v1.46.0-152-nightly +v1.46.0-157-nightly
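Taken together, patches 15 and 17 in this series change how the smartctl collector schedules and performs device discovery: a `scan_every` of 0 now means "scan once at startup and never again", and when `no_check_power_mode` is `never` the collector issues `smartctl --json --scan-open` instead of `--scan`, obtaining accurate device types without the extra scsi-to-sat probing. The Go sketch below condenses that combined behavior for illustration only; `Config`, `chooseScanCommand`, and the simplified `isTimeToScan` here are hypothetical stand-ins, not the actual types and methods in `src/go/plugin/go.d/modules/smartctl`.

```go
package main

import (
	"fmt"
	"time"
)

// Config loosely mirrors the two collector options touched by these patches.
// The field names are illustrative, not the real configuration struct.
type Config struct {
	ScanEvery        time.Duration // 0 means "scan only once on startup"
	NoCheckPowerMode string        // "never", "sleep", "standby" or "idle"
}

// chooseScanCommand reflects the patch 17 behavior: with power-mode checks
// disabled ("never"), the collector can afford `--scan-open`, which opens
// every device and reports accurate types instead of guessed ones.
func chooseScanCommand(cfg Config) string {
	if cfg.NoCheckPowerMode == "never" {
		return "smartctl --json --scan-open"
	}
	return "smartctl --json --scan"
}

// isTimeToScan condenses the patch 15 behavior plus the forced startup scan:
// the first call always scans, and a zero interval disables periodic rescans.
func isTimeToScan(cfg Config, lastScan, now time.Time, forceScan bool) bool {
	if forceScan {
		return true
	}
	if cfg.ScanEvery == 0 {
		return false
	}
	return now.After(lastScan.Add(cfg.ScanEvery))
}

func main() {
	cfg := Config{ScanEvery: 0, NoCheckPowerMode: "never"}
	start := time.Now()

	fmt.Println("scan command :", chooseScanCommand(cfg))
	fmt.Println("startup scan :", isTimeToScan(cfg, time.Time{}, start, true))
	fmt.Println("rescan later :", isTimeToScan(cfg, start, start.Add(time.Hour), false))
}
```

With this logic, a job configured with `scan_every: 0` and `no_check_power_mode: never` performs a single `--scan-open` at startup; because `--scan-open` already reports reliable device types, the extra `smartctl` probe that upgrades misdetected `scsi` devices to `sat` is skipped in that mode, matching the updated `scanDevices()` in patch 17.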