From a27d8b3a3413fa366ca208f0bb2561fecd52c04c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 5 Mar 2019 14:11:02 +0000 Subject: [PATCH] Add more system performance diagnostics This patch extends the NUMA module to be able to run some self-checks when it is used to reserve a CPU for a data plane, for example checking the CPU frequency scaling governor or the set of isolated CPUs. --- src/lib/README.numa.md | 23 ++++++++- src/lib/cpuset.lua | 63 ++--------------------- src/lib/numa.lua | 110 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 122 insertions(+), 74 deletions(-) diff --git a/src/lib/README.numa.md b/src/lib/README.numa.md index aabc2e7874..608903bbb8 100644 --- a/src/lib/README.numa.md +++ b/src/lib/README.numa.md @@ -12,11 +12,17 @@ for some reason the current process is not bound to a NUMA node. See [../doc/performance-tuning.md] for more notes on getting good performance out of your Snabb program. -— Function **bind_to_cpu** *cpu* +— Function **bind_to_cpu** *cpu* *skip_perf_checks* Bind the current process to *cpu*, arranging for it to only ever be run on that CPU. Additionally, call **bind_to_numa_node** on the NUMA node corresponding to *cpu*. +Unless the optional argument *skip_perf_checks* is true, also run some +basic checks to verify that the given core is suitable for processing +low-latency network traffic: that the CPU has the `performance` scaling +governor, that it has been reserved from the kernel scheduler, and so +on, printing out any problems to `stderr`. + — Function **bind_to_numa_node** *node* Bind the current process to NUMA node *node*, arranging for it to only ever allocate memory local to that NUMA node. Additionally, migrate @@ -55,3 +61,18 @@ node bound by **bind_to_numa_node**, if present, and in any case that all *addrs* are on the same NUMA node. If *require_affinity* is true (not the default), then error if a problem is detected, otherwise just print a warning to the console. 
+ +— Function **parse_cpuset** *cpus* +A helper function to parse a CPU set from a string. A CPU set is either +the number of a CPU, a range of CPUs, or two or more CPU sets joined by +commas. The result is a table whose keys are the CPUs and whose values +are true (a set). For example, `parse_cpuset("1-3,5")` will return a +table with keys 1, 2, 3, and 5 bound to `true`. + +— Function **node_cpus** *node* +Return a set of CPUs belonging to NUMA node *node*, in the same format +as in **parse_cpuset**. + +— Function **isolated_cpus** +Return a set of CPUs that have been "isolated" away from the kernel at +boot via the `isolcpus` kernel boot parameter. diff --git a/src/lib/cpuset.lua b/src/lib/cpuset.lua index 0adaaca54d..0905b9d83e 100644 --- a/src/lib/cpuset.lua +++ b/src/lib/cpuset.lua @@ -19,53 +19,7 @@ do end end -local function trim (str) - return str:gsub("^%s", ""):gsub("%s$", "") -end - -local function parse_cpulist (cpus) - local ret = {} - cpus = trim(cpus) - if #cpus == 0 then return ret end - for range in cpus:split(',') do - local lo, hi = range:match("^%s*([^%-]*)%s*-%s*([^%-%s]*)%s*$") - if lo == nil then lo = range:match("^%s*([^%-]*)%s*$") end - assert(lo ~= nil, 'invalid range: '..range) - lo = assert(tonumber(lo), 'invalid range begin: '..lo) - assert(lo == math.floor(lo), 'invalid range begin: '..lo) - if hi ~= nil then - hi = assert(tonumber(hi), 'invalid range end: '..hi) - assert(hi == math.floor(hi), 'invalid range end: '..hi) - assert(lo < hi, 'invalid range: '..range) - else - hi = lo - end - for cpu=lo,hi do table.insert(ret, cpu) end - end - return ret -end - -local function parse_cpulist_from_file (path) - local fd = assert(io.open(path)) - if not fd then return {} end - local ret = parse_cpulist(fd:read("*all")) - fd:close() - return ret -end - local function available_cpus (node) - local function set (t) - local ret = {} - for _,v in pairs(t) do ret[tostring(v)] = true end - return ret - end - local function cpus_in_node (node) - 
local node_path = '/sys/devices/system/node/node'..node - return set(parse_cpulist_from_file(node_path..'/cpulist')) - end - local function isolated_cpus () - return set(parse_cpulist_from_file('/sys/devices/system/cpu/isolated')) - end local function subtract (s, t) local ret = {} for k,_ in pairs(s) do @@ -75,7 +29,7 @@ local function available_cpus (node) return ret end -- XXX: Add sched_getaffinity cpus. - return subtract(cpus_in_node(node), isolated_cpus()) + return subtract(numa.node_cpus(node), numa.isolated_cpus()) end function CPUSet:bind_to_numa_node() @@ -87,7 +41,7 @@ function CPUSet:bind_to_numa_node() numa.bind_to_numa_node(nodes[1]) local cpus = available_cpus(nodes[1]) assert(#cpus > 0, 'Not available CPUs') - numa.bind_to_cpu(cpus) + numa.bind_to_cpu(cpus, 'skip-perf-checks') print(("Bound main process to NUMA node: %s (CPU %s)"):format(nodes[1], cpus[1])) else print("CPUs available from multiple NUMA nodes: "..table.concat(nodes, ",")) @@ -96,7 +50,7 @@ function CPUSet:bind_to_numa_node() end function CPUSet:add_from_string(cpus) - for _, cpu in ipairs(parse_cpulist(cpus)) do + for cpu,_ in pairs(numa.parse_cpuset(cpus)) do self:add(cpu) end end @@ -157,14 +111,3 @@ function CPUSet:release(cpu) end error('CPU not found on NUMA node: '..cpu..', '..node) end - -function selftest () - print('selftest: cpuset') - local cpus = parse_cpulist("0-5,7") - assert(#cpus == 7 and cpus[6] == 5 and cpus[7] == 7) - cpus = parse_cpulist("1") - assert(#cpus == 1 and cpus[1] == 1) - assert(#parse_cpulist("\n") == 0) - assert(#parse_cpulist("") == 0) - print('selftest: ok') -end diff --git a/src/lib/numa.lua b/src/lib/numa.lua index cd86b6ce77..60cb80af6b 100644 --- a/src/lib/numa.lua +++ b/src/lib/numa.lua @@ -18,6 +18,57 @@ local bound_numa_node local node_path = '/sys/devices/system/node/node' local MAX_CPU = 1023 +local function warn(fmt, ...) + io.stderr:write(string.format("Warning: ".. fmt .. 
"\n", ...)) + io.stderr:flush() +end + +local function die(fmt, ...) + error(string.format(fmt, ...)) +end + +local function trim (str) + return str:gsub("^%s", ""):gsub("%s$", "") +end + +function parse_cpuset (cpus) + local ret = {} + cpus = trim(cpus) + if #cpus == 0 then return ret end + for range in cpus:split(',') do + local lo, hi = range:match("^%s*([^%-]*)%s*-%s*([^%-%s]*)%s*$") + if lo == nil then lo = range:match("^%s*([^%-]*)%s*$") end + assert(lo ~= nil, 'invalid range: '..range) + lo = assert(tonumber(lo), 'invalid range begin: '..lo) + assert(lo == math.floor(lo), 'invalid range begin: '..lo) + if hi ~= nil then + hi = assert(tonumber(hi), 'invalid range end: '..hi) + assert(hi == math.floor(hi), 'invalid range end: '..hi) + assert(lo < hi, 'invalid range: '..range) + else + hi = lo + end + for cpu=lo,hi do table.insert(ret, cpu) end + end + return lib.set(unpack(ret)) +end + +local function parse_cpuset_from_file (path) + local fd = assert(io.open(path)) + if not fd then return {} end + local ret = parse_cpuset(fd:read("*all")) + fd:close() + return ret +end + +function node_cpus (node) + return parse_cpuset_from_file(node_path..node..'/cpulist') +end + +function isolated_cpus (node) + return parse_cpuset_from_file('/sys/devices/system/cpu/isolated') +end + function cpu_get_numa_node (cpu) local node = 0 while true do @@ -62,10 +113,10 @@ function choose_numa_node_for_pci_addresses (addrs, require_affinity) chosen_node = node chosen_because_of_addr = addr else - local msg = string.format( - "PCI devices %s and %s have different NUMA node affinities", - chosen_because_of_addr, addr) - if require_affinity then error(msg) else print('Warning: '..msg) end + local warn = warn + if require_affinity then warn = die end + warn("PCI devices %s and %s have different NUMA node affinities", + chosen_because_of_addr, addr) end end return chosen_node @@ -75,17 +126,17 @@ function check_affinity_for_pci_addresses (addrs) local policy = S.get_mempolicy() if 
policy.mode == S.c.MPOL_MODE['default'] then if has_numa() then - print('Warning: No NUMA memory affinity.') - print('Pass --cpu to bind to a CPU and its NUMA node.') + warn('No NUMA memory affinity.\n'.. + 'Pass --cpu to bind to a CPU and its NUMA node.') end elseif (policy.mode ~= S.c.MPOL_MODE['bind'] and policy.mode ~= S.c.MPOL_MODE['preferred']) then - print("Warning: NUMA memory policy already in effect, but it's not --membind or --preferred.") + warn("NUMA memory policy already in effect, but it's not --membind or --preferred.") else local node = S.getcpu().node local node_for_pci = choose_numa_node_for_pci_addresses(addrs) if node_for_pci and node ~= node_for_pci then - print("Warning: Bound NUMA node does not have affinity with PCI devices.") + warn("Bound NUMA node does not have affinity with PCI devices.") end end end @@ -98,7 +149,7 @@ function unbind_cpu () bound_cpu = nil end -function bind_to_cpu (cpu) +function bind_to_cpu (cpu, skip_perf_checks) local function contains (t, e) for k,v in ipairs(t) do if tonumber(v) == tonumber(e) then return true end @@ -117,6 +168,14 @@ function bind_to_cpu (cpu) bound_cpu = cpu_and_node.cpu bind_to_numa_node (cpu_and_node.node) + + if not skip_perf_checks then + local ok, err = pcall(check_cpu_performance_tuning, bound_cpu) + if not ok then + warn("Error checking performance tuning on CPU %s: %s", + bound_cpu, tostring(err)) + end + end end function unbind_numa_node () @@ -138,9 +197,8 @@ function bind_to_numa_node (node, policy) local from_mask = assert(S.get_mempolicy(nil, nil, nil, 'mems_allowed')).mask local ok, err = S.migrate_pages(0, from_mask, node) if not ok then - io.stderr:write( - string.format("Warning: Failed to migrate pages to NUMA node %d: %s\n", - node, tostring(err))) + warn("Failed to migrate pages to NUMA node %d: %s\n", + node, tostring(err)) end end @@ -152,11 +210,37 @@ function prevent_preemption(priority) 'Failed to enable real-time scheduling. 
Try running as root.') end +function check_cpu_performance_tuning (cpu, strict) + local warn = warn + if strict then warn = die end + local path = '/sys/devices/system/cpu/cpu'..cpu..'/cpufreq/scaling_governor' + local gov = assert(io.open(path)):read() + if not gov:match('performance') then + warn('Expected performance scaling governor for CPU %s, but got "%s"', + cpu, gov) + end + + if not isolated_cpus()[cpu] then + warn('Expected dedicated core, but CPU %s is not in isolcpus set', cpu) + end +end + function selftest () + local cpus = parse_cpuset("0-5,7") + for i=0,5 do assert(cpus[i]) end + assert(not cpus[6]) + assert(cpus[7]) + do + local count = 0 + for k,v in pairs(cpus) do count = count + 1 end + assert(count == 7) + end + assert(parse_cpuset("1")[1]) + function test_cpu(cpu) local node = cpu_get_numa_node(cpu) - bind_to_cpu(cpu) + bind_to_cpu(cpu, 'skip-perf-checks') assert(bound_cpu == cpu) assert(bound_numa_node == node) assert(S.getcpu().cpu == cpu)