Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NAS-130745 / 25.04 / Update smartctl usage to use JSON #14479

Merged
merged 32 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
df69897
Updated smartctl usage
aiden3c Aug 29, 2024
cc78135
Fixed syntax
aiden3c Aug 29, 2024
d36f54f
Update tests
aiden3c Sep 3, 2024
016d61d
Merge remote-tracking branch 'origin/master' into NAS-130745
aiden3c Sep 3, 2024
92af136
Merge remote-tracking branch 'origin/master' into NAS-130745
aiden3c Sep 10, 2024
dec3cb1
Merge remote-tracking branch 'origin/master' into NAS-130745
aiden3c Sep 10, 2024
b8fadfe
Unit test fixes
aiden3c Sep 10, 2024
392a162
Load the JSON
aiden3c Sep 10, 2024
6b18210
Quote confusion
aiden3c Sep 10, 2024
bced6a7
Self test fix
aiden3c Sep 10, 2024
0c9b2fc
Fixed remaining
aiden3c Sep 10, 2024
e9bcbbe
Actually the test is wrong
aiden3c Sep 10, 2024
ca58c73
Unit test fix, smart key update
aiden3c Sep 10, 2024
dc5091e
Fix for sct/code
aiden3c Sep 10, 2024
c1ade49
More fixes
aiden3c Sep 10, 2024
343c96d
Two out of three?
aiden3c Sep 10, 2024
d1f38e8
Key fixes
aiden3c Sep 11, 2024
fa3011d
Final fix
aiden3c Sep 11, 2024
21e1bb2
Suggested change: remove unnecessary json loads
aiden3c Sep 13, 2024
da4bce6
Oops
aiden3c Sep 13, 2024
a80b5a6
Report smart attributes for all relevant drive types
aiden3c Sep 13, 2024
2f1ef08
Schema test
aiden3c Sep 16, 2024
9ad689d
Refactor schema and tests
aiden3c Sep 17, 2024
36eefe3
Weird import?
aiden3c Sep 17, 2024
0647da7
Fixed scheme implementation
aiden3c Sep 17, 2024
40fa526
Updating implementation
aiden3c Sep 17, 2024
48f8597
Final fix
aiden3c Sep 17, 2024
0412117
Float fix
aiden3c Sep 17, 2024
bf0de90
Fixed disk smart reporting
aiden3c Sep 17, 2024
fa237ad
Unused regex
aiden3c Sep 17, 2024
4f790ef
Merge remote-tracking branch 'origin/master' into NAS-130745
aiden3c Sep 17, 2024
2ffd485
Suggested changes
aiden3c Sep 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions src/middlewared/middlewared/etc_files/smartd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import shlex
import subprocess
import json

from middlewared.common.smart.smartctl import get_smartctl_args, smartctl, SMARTCTX
from middlewared.plugins.smart_.schedule import SMARTD_SCHEDULE_PIECES, smartd_schedule_piece
Expand All @@ -23,15 +24,16 @@ async def ensure_smart_enabled(args):
if any(arg.startswith("/dev/nvme") for arg in args):
return True

p = await smartctl(args + ["-i"], stderr=subprocess.STDOUT, check=False, encoding="utf8", errors="ignore")
if not re.search("SMART.*abled", p.stdout):
p = await smartctl(args + ["-i", "--json=c"], check=False, stderr=subprocess.STDOUT, encoding="utf8", errors="ignore")
pjson = json.loads(p.stdout)
if not pjson["smart_support"]["available"]:
logger.debug("SMART is not supported on %r", args)
return False

if re.search("SMART.*Enabled", p.stdout):
if pjson["smart_support"]["enabled"]:
return True

p = await smartctl(args + ["-s", "on"], stderr=subprocess.STDOUT, check=False)
p = await smartctl(args + ["-s", "on"], check=False, stderr=subprocess.STDOUT)
if p.returncode == 0:
return True
else:
Expand Down
4 changes: 1 addition & 3 deletions src/middlewared/middlewared/plugins/disk.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,7 @@ async def disk_extend(self, disk, context):
if await self.middleware.call('truenas.is_ix_hardware'):
disk['supports_smart'] = True
else:
disk['supports_smart'] = disk['name'].startswith('nvme') or bool(RE_SMART_AVAILABLE.search(
await self.middleware.call('disk.smartctl', disk['name'], ['-a'], {'silent': True}) or ''
))
disk['supports_smart'] = disk['name'].startswith('nvme') or await self.middleware.call('disk.smartctl', disk['name'], ['-a', '--json=c'], {'silent': True})['smart_support']['available']
aiden3c marked this conversation as resolved.
Show resolved Hide resolved

if disk['name'] in context['boot_pool_disks']:
disk['pool'] = context['boot_pool_name']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async def smart_attributes(self, name):
"""
Returns S.M.A.R.T. attributes values for specified disk name.
"""
output = json.loads(await self.middleware.call('disk.smartctl', name, ['-A', '-j']))
output = json.loads(await self.middleware.call('disk.smartctl', name, ['-A', '--json=c']))

if 'ata_smart_attributes' in output:
aiden3c marked this conversation as resolved.
Show resolved Hide resolved
return output['ata_smart_attributes']['table']
Expand Down
6 changes: 3 additions & 3 deletions src/middlewared/middlewared/plugins/disk_/temperature.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import asyncio
import datetime
import time
import json

import async_timeout

Expand Down Expand Up @@ -61,9 +62,8 @@ async def temperature(self, name, options):

@private
async def temperature_uncached(self, name, powermode):
output = await self.middleware.call('disk.smartctl', name, ['-a', '-n', powermode.lower()], {'silent': True})
if output is not None:
return parse_smartctl_for_temperature_output(output)
if output := await self.middleware.call('disk.smartctl', name, ['-a', '-n', powermode.lower(), '--json=c'], {'silent': True}):
return parse_smartctl_for_temperature_output(json.loads(output))

@private
async def reset_temperature_cache(self):
Expand Down
190 changes: 87 additions & 103 deletions src/middlewared/middlewared/plugins/smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import functools
import re
import time
import json

from humanize import ordinal

Expand All @@ -21,159 +22,142 @@
RE_TIME = re.compile(r'test will complete after ([a-z]{3} [a-z]{3} [0-9 ]+ \d\d:\d\d:\d\d \d{4})', re.IGNORECASE)
RE_TIME_SCSIPRINT_EXTENDED = re.compile(r'Please wait (\d+) minutes for test to complete')

RE_OF_TEST_REMAINING = re.compile(r'([0-9]+)% of test remaining')
RE_SELF_TEST_STATUS = re.compile(r'self-test in progress \(([0-9]+)% completed\)')


async def annotate_disk_smart_tests(middleware, tests_filter, disk):
if disk["disk"] is None:
return

output = await middleware.call("disk.smartctl", disk["disk"], ["-a"], {"silent": True})
output = await middleware.call("disk.smartctl", disk["disk"], ["-a", "--json=c"], {"silent": True})
if output is None:
return
data = json.loads(output)

tests = parse_smart_selftest_results(output) or []
current_test = parse_current_smart_selftest(output)
tests = parse_smart_selftest_results(data) or []
current_test = parse_current_smart_selftest(data)
return dict(tests=filter_list(tests, tests_filter), current_test=current_test, **disk)


def parse_smart_selftest_results(stdout):
def parse_smart_selftest_results(data):
tests = []

# ataprint.cpp
if "LBA_of_first_error" in stdout:
for line in stdout.split("\n"):
if not line.startswith("#"):
continue

if line[58] == "%":
remaining = line[55:58]
lifetime = line[61:69]
else:
remaining = line[55:57]
lifetime = line[60:68]
if "ata_smart_self_test_log" in data:
if "table" in data["ata_smart_self_test_log"]["standard"]: # If there are no tests, there is no table
for index, entry in enumerate(data["ata_smart_self_test_log"]["standard"]["table"]):

test = {
"num": int(line[1:3].strip()),
"description": line[5:24].strip(),
"status_verbose": line[25:54].strip(),
"remaining": int(remaining.strip()) / 100,
"lifetime": int(lifetime.strip()),
"lba_of_first_error": line[77:].strip(),
}
# remaining_percent is in the dict only if the test is in progress (status value & 0x0f)
if remaining := entry["status"]["value"] & 0x0f:
aiden3c marked this conversation as resolved.
Show resolved Hide resolved
remaining = entry["status"]["remaining_percent"]

if test["status_verbose"] == "Completed without error":
test["status"] = "SUCCESS"
elif test["status_verbose"] == "Self-test routine in progress":
test["status"] = "RUNNING"
elif test["status_verbose"] in ["Aborted by host", "Interrupted (host reset)"]:
test["status"] = "ABORTED"
else:
test["status"] = "FAILED"
test = {
"num": index,
"description": entry["type"]["string"],
"status_verbose": entry["status"]["string"],
"remaining": remaining,
"lifetime": entry["lifetime_hours"],
"lba_of_first_error": entry.get("lba"), # only included if there is an error
}

if test["lba_of_first_error"] == "-":
test["lba_of_first_error"] = None
if test["status_verbose"] == "Completed without error":
test["status"] = "SUCCESS"
elif test["status_verbose"] == "Self-test routine in progress":
test["status"] = "RUNNING"
elif test["status_verbose"] in ["Aborted by host", "Interrupted (host reset)"]:
test["status"] = "ABORTED"
else:
test["status"] = "FAILED"

tests.append(test)
tests.append(test)

return tests

# nvmeprint.cpp
if "Failing_LBA" in stdout:
got_header = False
for line in stdout.split("\n"):
if "Failing_LBA" in line:
got_header = True
continue

if not got_header:
continue

try:
status_verbose = line[23:56].strip()
if status_verbose == "Completed without error":
status = "SUCCESS"
elif status_verbose.startswith("Aborted:"):
status = "ABORTED"
else:
status = "FAILED"
if "nvme_self_test_log" in data:
if "table" in data["nvme_self_test_log"]:
for index, entry in enumerate(data["nvme_self_test_log"]["table"]):

failing_lba = line[67:79].strip()
nsid = line[80:85].strip()
seg = line[86:89].strip()
sct = line[90:93]
code = line[94:98]
if lba := entry.get("lba"):
lba = entry["lba"]["value"]

test = {
"num": int(line[0:2].strip()),
"description": line[5:22].strip(),
"status": status,
"status_verbose": status_verbose,
"power_on_hours": int(line[57:66].strip()),
"failing_lba": None if failing_lba == "-" else int(failing_lba),
"nsid": None if nsid == "-" else nsid,
"seg": None if seg == "-" else int(seg),
"sct": sct,
"code": code,
"num": index,
"description": entry["self_test_code"]["string"],
"status_verbose": entry["self_test_result"]["string"],
"power_on_hours": entry["power_on_hours"],
"failing_lba": lba,
"nsid": entry.get("nsid"),
"seg": entry.get("segment"),
"sct": entry.get("status_code_type") or 0x0,
aiden3c marked this conversation as resolved.
Show resolved Hide resolved
"code": entry.get("status_code") or 0x0,
}
except ValueError:
break

tests.append(test)
if test["status_verbose"] == "Completed without error":
test["status"] = "SUCCESS"
elif test["status_verbose"].startswith("Aborted:"):
test["status"] = "ABORTED"
else:
test["status"] = "FAILED"

tests.append(test)

return tests

# scsiprint.cpp
if "LBA_first_err" in stdout:
for line in stdout.split("\n"):
if not line.startswith("#"):
continue
# this JSON has numbered keys as an index, there's a reason it's not called a "smart" test
if "scsi_self_test_0" in data: # 0 is most recent test
for index in range(0, 20): # only 20 tests can ever return
test_key = f"scsi_self_test_{index}"
if not test_key in data:
break
entry = data[test_key]

if segment := entry.get("failed_segment"):
segment = entry["failed_segment"]["value"]

if lba := entry.get("lba_first_failure"):
lba = entry["lba_first_failure"]["value"]

lifetime = 0
if not entry.get("self_test_in_progress"):
lifetime = entry["power_on_time"]["hours"]

test = {
"num": int(line[1:3].strip()),
"description": line[5:22].strip(),
"status_verbose": line[23:48].strip(),
"segment_number": line[49:52].strip(),
"lifetime": line[55:60].strip(),
"lba_of_first_error": line[60:78].strip(),
"num": index,
"description": entry["code"]["string"],
"status_verbose": entry["result"]["string"],
"segment_number": segment,
"lifetime": lifetime,
"lba_of_first_error": lba,
}

if test["status_verbose"] == "Completed":
test["status"] = "SUCCESS"
elif test["status_verbose"] == "Self test in progress ...":
test["status"] = "RUNNING"
elif test["status_verbose"] in ["Aborted (by user command)", "Aborted (device reset ?)"]:
elif test["status_verbose"].startswith("Aborted"):
test["status"] = "ABORTED"
else:
test["status"] = "FAILED"

if test["segment_number"] == "-":
test["segment_number"] = None
else:
test["segment_number"] = int(test["segment_number"])

if test["lifetime"] == "NOW":
test["lifetime"] = None
else:
test["lifetime"] = int(test["lifetime"])

if test["lba_of_first_error"] == "-":
test["lba_of_first_error"] = None

tests.append(test)

return tests


def parse_current_smart_selftest(stdout):
if remaining := RE_OF_TEST_REMAINING.search(stdout):
return {"progress": 100 - int(remaining.group(1))}
def parse_current_smart_selftest(data):
# ata
if "ata_smart_self_test_log" in data:
if tests := data["ata_smart_self_test_log"]["standard"].get("table"):
if remaining := tests[0]["status"].get("remaining_percent"):
return {"progress": 100 - remaining}

if remaining := RE_SELF_TEST_STATUS.search(stdout):
return {"progress": int(remaining.group(1))}
# nvme
if "nvme_self_test_log" in data:
if remaining := data["nvme_self_test_log"].get("current_self_test_completion_percent"):
return {"progress": remaining}

if "Self test in progress ..." in stdout:
# scsi gives no progress
if "self_test_in_progress" in data:
return {"progress": 0}


Expand Down
14 changes: 7 additions & 7 deletions src/middlewared/middlewared/pytest/unit/etc_files/test_smartd.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
@pytest.mark.asyncio
async def test__ensure_smart_enabled__smart_error():
with patch("middlewared.etc_files.smartd.smartctl") as run:
run.return_value = Mock(stdout="S.M.A.R.T. Error")
run.return_value = Mock(stdout='{"smart_support": {"enabled": false, "available": false}}')

assert await ensure_smart_enabled(["/dev/ada0"]) is False

Expand All @@ -22,7 +22,7 @@ async def test__ensure_smart_enabled__smart_error():
@pytest.mark.asyncio
async def test__ensure_smart_enabled__smart_enabled():
with patch("middlewared.etc_files.smartd.smartctl") as run:
run.return_value = Mock(stdout="SMART Enabled")
run.return_value = Mock(stdout='{"smart_support": {"enabled": true, "available": true}}')

assert await ensure_smart_enabled(["/dev/ada0"])

Expand All @@ -32,12 +32,12 @@ async def test__ensure_smart_enabled__smart_enabled():
@pytest.mark.asyncio
async def test__ensure_smart_enabled__smart_was_disabled():
with patch("middlewared.etc_files.smartd.smartctl") as run:
run.return_value = Mock(stdout="SMART Disabled", returncode=0)
run.return_value = Mock(stdout='{"smart_support": {"enabled": false, "available": true}}', returncode=0)

assert await ensure_smart_enabled(["/dev/ada0"])

assert run.call_args_list == [
call(["/dev/ada0", "-i"], check=False, stderr=subprocess.STDOUT,
call(["/dev/ada0", "-i", "--json=c"], check=False, stderr=subprocess.STDOUT,
encoding="utf8", errors="ignore"),
call(["/dev/ada0", "-s", "on"], check=False, stderr=subprocess.STDOUT),
]
Expand All @@ -46,20 +46,20 @@ async def test__ensure_smart_enabled__smart_was_disabled():
@pytest.mark.asyncio
async def test__ensure_smart_enabled__enabling_smart_failed():
with patch("middlewared.etc_files.smartd.smartctl") as run:
run.return_value = Mock(stdout="SMART Disabled", returncode=1)
run.return_value = Mock(stdout='{"smart_support": {"enabled": false, "available": false}}', returncode=1)

assert await ensure_smart_enabled(["/dev/ada0"]) is False


@pytest.mark.asyncio
async def test__ensure_smart_enabled__handled_args_properly():
with patch("middlewared.etc_files.smartd.smartctl") as run:
run.return_value = Mock(stdout="SMART Enabled")
run.return_value = Mock(stdout='{"smart_support": {"enabled": true, "available": true}}')

assert await ensure_smart_enabled(["/dev/ada0", "-d", "sat"])

run.assert_called_once_with(
["/dev/ada0", "-d", "sat", "-i"], check=False, stderr=subprocess.STDOUT,
["/dev/ada0", "-d", "sat", "-i", "--json=c"], check=False, stderr=subprocess.STDOUT,
encoding="utf8", errors="ignore",
)

Expand Down
Loading
Loading