From 533ab038051298390ee1a61c721335f63ee6f96f Mon Sep 17 00:00:00 2001 From: Harald Fischer Date: Fri, 7 Jul 2023 00:05:49 +0200 Subject: [PATCH] Split metrics from `device` resource Remodel `device` resource and split metrics into `device metrics record` This reduces the load on the `device` resource when updating device metrics periodically. device metrics are far less frequently needed than the `device` resource. Thus writing to the heavily used `device` resource should be offloaded to the `device metrics record` resource. Change-type: major Signed-off-by: Harald Fischer --- src/balena-init.sql | 3 + src/balena-model.ts | 24 ++++-- src/balena.sbvr | 25 ++++++ src/features/cascade-delete/hooks.ts | 1 + src/features/device-state/index.ts | 4 +- .../device-state/routes/state-patch-v2.ts | 67 +++++++++------- .../device-state/routes/state-patch-v3.ts | 74 +++++++++++------- .../device-state/state-patch-utils.ts | 50 ++++++++---- src/lib/auth.ts | 8 ++ ...plit-device-metrics-from-device-schema.sql | 19 +++++ ...-split-device-metrics-from-device.async.ts | 78 +++++++++++++++++++ test/03_device-state.ts | 50 +++++++++--- 12 files changed, 313 insertions(+), 90 deletions(-) create mode 100644 src/migrations/00084-split-device-metrics-from-device-schema.sql create mode 100644 src/migrations/00085-split-device-metrics-from-device.async.ts diff --git a/src/balena-init.sql b/src/balena-init.sql index 9a1b34258..4e22fffa3 100644 --- a/src/balena-init.sql +++ b/src/balena-init.sql @@ -188,3 +188,6 @@ ON "application" ("slug" varchar_pattern_ops, "is public", "is host"); CREATE INDEX IF NOT EXISTS "scheduled_job_run_start_timestamp_idx" ON "scheduled job run" (DATE_TRUNC('milliseconds', "start timestamp")); + +CREATE INDEX IF NOT EXISTS "device_metrics_record_by_device_idx" +ON "device metrics record" ("is reported by-device"); diff --git a/src/balena-model.ts b/src/balena-model.ts index c97aa81de..c99cb84e2 100644 --- a/src/balena-model.ts +++ b/src/balena-model.ts @@ -364,14 +364,6 @@ export interface Device { public_address: string | null; ip_address: string | null; mac_address: string | null; - memory_usage: number | null; - memory_total: number | null; - storage_block_device: string | null; - storage_usage: number | null; - storage_total: number | null; - cpu_usage: number | null; - cpu_temp: number | null; - is_undervolted: boolean; cpu_id: string | null; is_running__release: { __id: number } | [Release?] | null; download_progress: number | null; @@ -403,6 +395,7 @@ export interface Device { service_install?: ServiceInstall[]; installs__image?: ImageInstall[]; installs__application__has__service_name?: ServiceInstall[]; + reports__device_metrics_record?: DeviceMetricsRecord[]; } export interface DeviceEnvironmentVariable { @@ -565,6 +558,21 @@ export interface UserHasPublicKey { title: string; } +export interface DeviceMetricsRecord { + created_at: DateString; + modified_at: DateString; + id: number; + is_reported_by__device: { __id: number } | [Device]; + memory_usage: number | null; + memory_total: number | null; + storage_block_device: string | null; + storage_usage: number | null; + storage_total: number | null; + cpu_usage: number | null; + cpu_temp: number | null; + is_undervolted: boolean; +} + export interface DeviceTypeAlias { created_at: DateString; modified_at: DateString; diff --git a/src/balena.sbvr b/src/balena.sbvr index ec352c508..50f94f93f 100644 --- a/src/balena.sbvr +++ b/src/balena.sbvr @@ -581,6 +581,31 @@ Fact type: device should be managed by release Necessity: each device should be managed by at most one release +Term: device metrics record + +-- device metrics record +Fact type: device metrics record is reported by device + Synonymous Form: device reports device metrics record + Necessity: each device metrics record is reported by exactly one device + Necessity: each device reports exactly one device metrics record + +Fact type: device metrics record has memory usage + Necessity: each device metrics record has at most one memory usage +Fact type: device metrics record has memory total + Necessity: each device metrics record has at most one memory total +Fact type: device metrics record has storage block device + Necessity: each device metrics record has at most one storage block device +Fact type: device metrics record has storage usage + Necessity: each device metrics record has at most one storage usage +Fact type: device metrics record has storage total + Necessity: each device metrics record has at most one storage total +Fact type: device metrics record has cpu usage + Necessity: each device metrics record has at most one cpu usage +Fact type: device metrics record has cpu temp + Necessity: each device metrics record has at most one cpu temp +Fact type: device metrics record is undervolted + + -- application config variable Fact type: application config variable has value diff --git a/src/features/cascade-delete/hooks.ts b/src/features/cascade-delete/hooks.ts index 69f4764cf..d159ce77e 100644 --- a/src/features/cascade-delete/hooks.ts +++ b/src/features/cascade-delete/hooks.ts @@ -22,6 +22,7 @@ setupDeleteCascade('device', { device_tag: 'device', image_install: 'device', service_install: 'device', + device_metrics_record: 'is_reported_by__device', }); setupDeleteCascade('image', { diff --git a/src/features/device-state/index.ts b/src/features/device-state/index.ts index 54610501c..ef7bc9858 100644 --- a/src/features/device-state/index.ts +++ b/src/features/device-state/index.ts @@ -23,9 +23,11 @@ export { serviceInstallFromImage, } from './state-get-utils'; export { - metricsPatchFields, + validDeviceMetricsRecordPatchFields, v2ValidPatchFields, + v2ValidDevicePatchFields, v3ValidPatchFields, + v3ValidDevicePatchFields, } from './state-patch-utils'; const gracefulGet = resolveOrDenyDevicesWithStatus(304); diff --git a/src/features/device-state/routes/state-patch-v2.ts b/src/features/device-state/routes/state-patch-v2.ts index bddea27c6..23abda076 100644 --- a/src/features/device-state/routes/state-patch-v2.ts +++ b/src/features/device-state/routes/state-patch-v2.ts @@ -10,11 +10,12 @@ import { getIP } from '../../../lib/utils'; import type { ImageInstall, PickDeferred } from '../../../balena-model'; import { shouldUpdateMetrics, - metricsPatchFields, - v2ValidPatchFields, upsertImageInstall, deleteOldImageInstalls, truncateShortTextFields, + StatePatchDeviceMetricsRecordBody, + validDeviceMetricsRecordPatchFields, + v2ValidDevicePatchFields, } from '../state-patch-utils'; import type { ResolveDeviceInfoCustomObject } from '../middleware'; @@ -26,7 +27,7 @@ type LocalBody = NonNullable; * These typings should be used as a guide to what should be sent, but cannot be trusted as what actually *is* sent. */ export type StatePatchV2Body = { - local?: { + local?: StatePatchDeviceMetricsRecordBody & { should_be_running__release?: number; name?: string; /** @@ -46,15 +47,6 @@ export type StatePatchV2Body = { download_progress?: number | null; api_port?: number; api_secret?: string; - memory_usage?: number; - memory_total?: number; - storage_block_device?: string; - storage_usage?: number; - storage_total?: number; - cpu_temp?: number; - cpu_usage?: number; - cpu_id?: string; - is_undervolted?: boolean; is_on__commit?: string | null; apps?: Array<{ services?: { @@ -139,20 +131,13 @@ export const statePatchV2: RequestHandler = async (req, res) => { const { apps } = local; let deviceBody: - | Pick & { + | Pick & { is_running__release?: number | null; - } = _.pick(local, v2ValidPatchFields); - let metricsBody: Pick = - _.pick(local, metricsPatchFields); - if ( - Object.keys(metricsBody).length > 0 && - (await shouldUpdateMetrics(uuid)) - ) { - // If we should force a metrics update then merge the two together and clear `metricsBody` so - // that we don't try to merge it again later - deviceBody = { ...deviceBody, ...metricsBody }; - metricsBody = {}; - } + } = _.pick(local, v2ValidDevicePatchFields); + const metricsBody: Pick< + LocalBody, + (typeof validDeviceMetricsRecordPatchFields)[number] + > = _.pick(local, validDeviceMetricsRecordPatchFields); if (local.name != null) { deviceBody.device_name = local.name; @@ -191,15 +176,13 @@ export const statePatchV2: RequestHandler = async (req, res) => { } } } - - updateFns.push(async (resinApiTx) => { - if (Object.keys(deviceBody).length > 0) { + if (Object.keys(deviceBody).length > 0) { + updateFns.push(async (resinApiTx) => { // truncate for resilient legacy compatible device state patch so that supervisors don't fail // to update b/c of length violation of 255 (SBVR SHORT TEXT type) for ip and mac address. // sbvr-types does not export SHORT TEXT VARCHAR length 255 to import. deviceBody = truncateShortTextFields(deviceBody); // If we're updating anyway then ensure the metrics data is included - deviceBody = { ...deviceBody, ...metricsBody }; await resinApiTx.patch({ resource: 'device', id: deviceId, @@ -208,6 +191,32 @@ export const statePatchV2: RequestHandler = async (req, res) => { }, body: deviceBody, }); + }); + } + } + if ( + Object.keys(metricsBody).length > 0 && + (await shouldUpdateMetrics(uuid)) + ) { + updateFns.push(async (resinApiTx) => { + const latestDeviceMetricsRecord = await resinApiTx.get({ + resource: 'device_metrics_record', + id: { is_reported_by__device: deviceId }, + }); + if (latestDeviceMetricsRecord == null) { + await resinApiTx.post({ + resource: 'device_metrics_record', + id: { is_reported_by__device: deviceId }, + body: { + ...metricsBody, + }, + }); + } else { + await resinApiTx.patch({ + resource: 'device_metrics_record', + id: { is_reported_by__device: deviceId }, + body: metricsBody, + }); } }); } diff --git a/src/features/device-state/routes/state-patch-v3.ts b/src/features/device-state/routes/state-patch-v3.ts index 0fa09f55f..d4df480f2 100644 --- a/src/features/device-state/routes/state-patch-v3.ts +++ b/src/features/device-state/routes/state-patch-v3.ts @@ -10,18 +10,21 @@ import { getIP } from '../../../lib/utils'; import { Application, Device, + DeviceMetricsRecord, Image, ImageInstall, PickDeferred, Release, } from '../../../balena-model'; import type { Filter } from 'pinejs-client-core'; -import { metricsPatchFields, v3ValidPatchFields } from '..'; +import { v3ValidDevicePatchFields } from '..'; import { deleteOldImageInstalls, upsertImageInstall, shouldUpdateMetrics, truncateShortTextFields, + StatePatchDeviceMetricsRecordBody, + validDeviceMetricsRecordPatchFields, } from '../state-patch-utils'; const { BadRequestError, UnauthorizedError, InternalRequestError } = errors; @@ -31,7 +34,7 @@ const { api } = sbvrUtils; * These typings should be used as a guide to what should be sent, but cannot be trusted as what actually *is* sent. */ export type StatePatchV3Body = { - [uuid: string]: { + [uuid: string]: StatePatchDeviceMetricsRecordBody & { status?: string; os_version?: string; os_variant?: string; @@ -42,15 +45,7 @@ export type StatePatchV3Body = { mac_address?: string; api_port?: number; api_secret?: string; - memory_usage?: number; - memory_total?: number; - storage_block_device?: string; - storage_usage?: number; - storage_total?: number; - cpu_temp?: number; - cpu_usage?: number; - cpu_id?: string; - is_undervolted?: boolean; + /** * Used for setting dependent devices as online */ @@ -103,11 +98,13 @@ const fetchData = async ( belongs_to__application: { $select: 'uuid', }, + reports__device_metrics_record: {}, }, }, })) as Array< Pick & { belongs_to__application: Array>; + reports__device_metrics_record: DeviceMetricsRecord[]; } >; if (devices.length !== uuids.length) { @@ -269,23 +266,14 @@ export const statePatchV3: RequestHandler = async (req, res) => { let deviceBody: | Pick< StatePatchV3Body[string], - (typeof v3ValidPatchFields)[number] + (typeof v3ValidDevicePatchFields)[number] > & { is_running__release?: number | null; - } = _.pick(state, v3ValidPatchFields); + } = _.pick(state, v3ValidDevicePatchFields); let metricsBody: Pick< StatePatchV3Body[string], - (typeof metricsPatchFields)[number] - > = _.pick(state, metricsPatchFields); - if ( - Object.keys(metricsBody).length > 0 && - (await shouldUpdateMetrics(uuid)) - ) { - // If we should force a metrics update then merge the two together and clear `metricsBody` so - // that we don't try to merge it again later - deviceBody = { ...deviceBody, ...metricsBody }; - metricsBody = {}; - } + (typeof validDeviceMetricsRecordPatchFields)[number] + > = _.pick(state, validDeviceMetricsRecordPatchFields); if (deviceBody.cpu_id != null) { if (/[^\x20-\x7E]/.test(deviceBody.cpu_id)) { @@ -297,12 +285,25 @@ export const statePatchV3: RequestHandler = async (req, res) => { } } - if (apps != null || Object.keys(deviceBody).length > 0) { + if ( + Object.keys(metricsBody).length > 0 && + !(await shouldUpdateMetrics(uuid)) + ) { + // If we don't want to update the metrics then clear them + metricsBody = {}; + } + + if ( + apps != null || + Object.keys(deviceBody).length > 0 || + Object.keys(metricsBody).length > 0 + ) { // We lazily fetch the necessary data only if we absolutely must to avoid unnecessary work if it turns out we don't need it data ??= await fetchData(req, custom, uuids, appReleasesCriteria); const { images, releasesByAppUuid } = data; const device = data.devicesByUuid[uuid]; - + const [latestDeviceMetricsRecord] = + device.reports__device_metrics_record; if (apps != null) { const userAppUuid = device.belongs_to__application[0].uuid; if (releasesByAppUuid[userAppUuid] != null) { @@ -321,7 +322,6 @@ export const statePatchV3: RequestHandler = async (req, res) => { // sbvr-types does not export SHORT TEXT VARCHAR length 255 to import. deviceBody = truncateShortTextFields(deviceBody); // If we're updating anyway then ensure the metrics data is included - deviceBody = { ...deviceBody, ...metricsBody }; updateFns.push(async (resinApiTx) => { await resinApiTx.patch({ resource: 'device', @@ -334,6 +334,26 @@ export const statePatchV3: RequestHandler = async (req, res) => { }); } + if (Object.keys(metricsBody).length > 0) { + updateFns.push(async (resinApiTx) => { + if (latestDeviceMetricsRecord == null) { + await resinApiTx.post({ + resource: 'device_metrics_record', + body: { + ...metricsBody, + ...{ is_reported_by__device: device.id }, + }, + }); + } else { + await resinApiTx.patch({ + resource: 'device_metrics_record', + id: { is_reported_by__device: device.id }, + body: metricsBody, + }); + } + }); + } + if (apps != null) { const imgInstalls: Array<{ imageId: number; diff --git a/src/features/device-state/state-patch-utils.ts b/src/features/device-state/state-patch-utils.ts index 8bdcda56e..541672556 100644 --- a/src/features/device-state/state-patch-utils.ts +++ b/src/features/device-state/state-patch-utils.ts @@ -10,7 +10,32 @@ import { import { createMultiLevelStore } from '../../infra/cache'; import { permissions, sbvrUtils } from '@balena/pinejs'; -export const v3ValidPatchFields: Array< +export type StatePatchDeviceMetricsRecordBody = { + memory_usage?: number; + memory_total?: number; + storage_block_device?: string; + storage_usage?: number; + storage_total?: number; + cpu_temp?: number; + cpu_usage?: number; + cpu_id?: string; + is_undervolted?: boolean; +}; + +export const validDeviceMetricsRecordPatchFields: Array< + keyof StatePatchDeviceMetricsRecordBody +> = [ + 'memory_usage', + 'memory_total', + 'storage_block_device', + 'storage_usage', + 'storage_total', + 'cpu_temp', + 'cpu_usage', + 'is_undervolted', +]; + +export const v3ValidDevicePatchFields: Array< Exclude > = [ 'status', @@ -25,19 +50,26 @@ export const v3ValidPatchFields: Array< 'api_port', 'api_secret', 'cpu_id', - 'is_undervolted', ]; -export const v2ValidPatchFields: Array< +export const v3ValidPatchFields: Array< + Exclude +> = [...v3ValidDevicePatchFields, ...validDeviceMetricsRecordPatchFields]; + +export const v2ValidDevicePatchFields: Array< Exclude, 'apps'> > = [ - ...v3ValidPatchFields, + ...v3ValidDevicePatchFields, 'should_be_running__release', 'device_name', 'note', 'download_progress', ]; +export const v2ValidPatchFields: Array< + Exclude, 'apps'> +> = [...v2ValidDevicePatchFields, ...validDeviceMetricsRecordPatchFields]; + const SHORT_TEXT_LENGTH = 255; const ADDRESS_DELIMITER = ' '; @@ -80,16 +112,6 @@ export const truncateShortTextFields = ( return object; }; -export const metricsPatchFields = [ - 'memory_usage', - 'memory_total', - 'storage_block_device', - 'storage_usage', - 'storage_total', - 'cpu_temp', - 'cpu_usage', -] as const; - export const shouldUpdateMetrics = (() => { const lastMetricsReportTime = createMultiLevelStore( 'lastMetricsReportTime', diff --git a/src/lib/auth.ts b/src/lib/auth.ts index 89d32fa66..454a26b7e 100644 --- a/src/lib/auth.ts +++ b/src/lib/auth.ts @@ -51,6 +51,7 @@ export const ROLES: { 'resin.application_tag.all', 'resin.application_type.all', 'resin.device.all', + 'resin.device_metrics_record.all', 'resin.device.tunnel-22222', 'resin.device_config_variable.all', 'resin.device_environment_variable.all', @@ -79,6 +80,13 @@ export const DEVICE_API_KEY_PERMISSIONS = [ 'resin.device_type.read?describes__device/canAccess()', `resin.device.read?${matchesNonFrozenDeviceActor()}`, `resin.device.update?${matchesNonFrozenDeviceActor()}`, + 'resin.device_metrics_record.read?is_reported_by__device/canAccess()', + `resin.device_metrics_record.create?is_reported_by__device/any(d:${matchesNonFrozenDeviceActor( + 'd', + )})`, + `resin.device_metrics_record.update?is_reported_by__device/any(d:${matchesNonFrozenDeviceActor( + 'd', + )})`, 'resin.application.read?owns__device/canAccess() or (is_public eq true and is_for__device_type/any(dt:dt/describes__device/canAccess()))', 'resin.application_tag.read?application/canAccess()', 'resin.device_config_variable.read?device/canAccess()', diff --git a/src/migrations/00084-split-device-metrics-from-device-schema.sql b/src/migrations/00084-split-device-metrics-from-device-schema.sql new file mode 100644 index 000000000..d44a9741b --- /dev/null +++ b/src/migrations/00084-split-device-metrics-from-device-schema.sql @@ -0,0 +1,19 @@ +CREATE TABLE IF NOT EXISTS "device metrics record" ( + "created at" TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL +, "modified at" TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL +, "id" SERIAL NOT NULL PRIMARY KEY +, "is reported by-device" INTEGER NOT NULL UNIQUE +, "memory usage" INTEGER NULL +, "memory total" INTEGER NULL +, "storage block device" VARCHAR(255) NULL +, "storage usage" INTEGER NULL +, "storage total" INTEGER NULL +, "cpu usage" INTEGER NULL +, "cpu temp" INTEGER NULL +, "is undervolted" BOOLEAN DEFAULT FALSE NOT NULL +, FOREIGN KEY ("is reported by-device") REFERENCES "device" ("id") +); + + +CREATE INDEX IF NOT EXISTS "device_metrics_record_by_device_idx" +ON "device metrics record" ("is reported by-device"); diff --git a/src/migrations/00085-split-device-metrics-from-device.async.ts b/src/migrations/00085-split-device-metrics-from-device.async.ts new file mode 100644 index 000000000..9d2ad1682 --- /dev/null +++ b/src/migrations/00085-split-device-metrics-from-device.async.ts @@ -0,0 +1,78 @@ +import type { Migrator } from '@balena/pinejs'; + +const migration: Migrator.AsyncMigration = { + asyncSql: `\ + INSERT INTO "device metrics record" ( + "is reported by-device", + "memory usage", + "memory total", + "storage block device", + "storage usage", + "storage total", + "cpu usage", + "cpu temp", + "is undervolted" + ) + SELECT + "id", + "memory usage", + "memory total", + "storage block device", + "storage usage", + "storage total", + "cpu usage", + "cpu temp", + "is undervolted" + FROM ( + SELECT * FROM "device" + WHERE + "device".id NOT IN ( + SELECT + "is reported by-device" + FROM + "device metrics record" + ) + LIMIT %%ASYNC_BATCH_SIZE%% + FOR UPDATE SKIP LOCKED + ) AS pending;`, + syncSql: `\ + INSERT INTO "device metrics record" ( + "is reported by-device", + "memory usage", + "memory total", + "storage block device", + "storage usage", + "storage total", + "cpu usage", + "cpu temp", + "is undervolted" + ) + SELECT + "id", + "memory usage", + "memory total", + "storage block device", + "storage usage", + "storage total", + "cpu usage", + "cpu temp", + "is undervolted" + FROM ( + SELECT * FROM "device" + WHERE + "device".id NOT IN ( + SELECT + "is reported by-device" + FROM + "device metrics record" + ) + FOR UPDATE SKIP LOCKED + ) AS pending;`, + asyncBatchSize: 10000, + delayMS: 5000, + backoffDelayMS: 60000, + errorThreshold: 15, + finalize: false, +}; + +export default migration; diff --git a/test/03_device-state.ts b/test/03_device-state.ts index eb1a35c30..50eed54d6 100644 --- a/test/03_device-state.ts +++ b/test/03_device-state.ts @@ -15,6 +15,7 @@ import { expectResourceToMatch } from './test-lib/api-helpers'; import { redis, redisRO } from '../src/infra/redis'; import { setTimeout } from 'timers/promises'; import { MINUTES, SECONDS } from '@balena/env-parsing'; +import { validDeviceMetricsRecordPatchFields } from '../src/features/device-state/state-patch-utils'; const { api } = sbvrUtils; @@ -777,7 +778,24 @@ mockery.registerMock('../src/lib/config', configMock); ) : _.pickBy(devicePatchBody[stateKey], (_v, key) => key !== 'name'); - await expectResourceToMatch(pineUser, 'device', device.id, expectedData); + await expectResourceToMatch( + pineUser, + 'device', + device.id, + _.omit(expectedData, validDeviceMetricsRecordPatchFields), + ); + + const expectedMetrics = _.pick( + devicePatchBody[stateKey], + validDeviceMetricsRecordPatchFields, + ); + + await expectResourceToMatch( + pineUser, + 'device_metrics_record', + { is_reported_by__device: device.id }, + expectedMetrics, + ); }); it('should not save an invalid CPU ID', async () => { @@ -891,10 +909,15 @@ mockery.registerMock('../src/lib/config', configMock); ); await setTimeout(200); - await expectResourceToMatch(pineUser, 'device', device.id, { - cpu_usage: 34, - cpu_temp: 56, - }); + await expectResourceToMatch( + pineUser, + 'device_metrics_record', + { is_reported_by__device: device.id }, + { + cpu_usage: 34, + cpu_temp: 56, + }, + ); }); it('should clear the throttling key from redis after the throttling window passes', async () => { @@ -920,8 +943,8 @@ mockery.registerMock('../src/lib/config', configMock); await expectResourceToMatch( pineUser, - 'device', - device.id, + 'device_metrics_record', + { is_reported_by__device: device.id }, devicePatchBody[stateKey], ); }); @@ -953,10 +976,15 @@ mockery.registerMock('../src/lib/config', configMock); stateVersion, ); - await expectResourceToMatch(pineUser, 'device', device.id, { - cpu_usage: 20, - cpu_temp: 20, - }); + await expectResourceToMatch( + pineUser, + 'device_metrics_record', + { is_reported_by__device: device.id }, + { + cpu_usage: 20, + cpu_temp: 20, + }, + ); }); it('should save the update progress of the device state', async () => {