diff --git a/plugin-server/src/worker/ingestion/process-event.ts b/plugin-server/src/worker/ingestion/process-event.ts index 73e8249626a7b..cc488f494e796 100644 --- a/plugin-server/src/worker/ingestion/process-event.ts +++ b/plugin-server/src/worker/ingestion/process-event.ts @@ -1,6 +1,7 @@ import ClickHouse from '@posthog/clickhouse' import { PluginEvent, Properties } from '@posthog/plugin-scaffold' import * as Sentry from '@sentry/node' +import { captureException } from '@sentry/node' import { DateTime } from 'luxon' import { activeMilliseconds } from '../../main/ingestion-queues/session-recording/snapshot-segmenter' @@ -306,21 +307,21 @@ export type ConsoleLogEntry = { timestamp: ClickHouseTimestamp } +function sanitizeForUTF8(input: string): string { + // the JS console truncates some logs... + // when it does that it doesn't check if the output is valid UTF-8 + // and so it can truncate half way through a UTF-16 pair 🤷 + // the simplest way to fix this is to convert to a buffer and back + // annoyingly Node 20 has `toWellFormed` which might have been useful + const buffer = Buffer.from(input) + return buffer.toString() +} + function safeString(payload: (string | null)[]) { // the individual strings are sometimes wrapped in quotes... we want to strip those return payload .filter((item): item is string => !!item && typeof item === 'string') - .map((item) => { - let candidate = item - if (candidate.startsWith('"') || candidate.startsWith("'")) { - candidate = candidate.substring(1) - } - - if (candidate.endsWith('"') || candidate.endsWith("'")) { - candidate = candidate.substring(0, candidate.length - 1) - } - return candidate - }) + .map((item) => sanitizeForUTF8(item.substring(0, 2999))) .join(' ') } @@ -362,18 +363,23 @@ export const gatherConsoleLogEvents = ( // it should be unnecessary to check for truthiness of event here, // but we've seen null in production so 🤷 if (!!event && event.type === RRWebEventType.Plugin && event.data?.plugin === 'rrweb/console@1') { - const level = event.data.payload?.level - const message = safeString(event.data.payload?.payload) - consoleLogEntries.push({ - team_id, - // TODO when is it not a single item array? - message: message, - log_level: level, - log_source: 'session_replay', - log_source_id: session_id, - instance_id: null, - timestamp: castTimestampOrNow(DateTime.fromMillis(event.timestamp), TimestampFormat.ClickHouse), - }) + try { + const level = event.data.payload?.level + const message = safeString(event.data.payload?.payload) + consoleLogEntries.push({ + team_id, + // TODO when is it not a single item array? + message: message, + log_level: level, + log_source: 'session_replay', + log_source_id: session_id, + instance_id: null, + timestamp: castTimestampOrNow(DateTime.fromMillis(event.timestamp), TimestampFormat.ClickHouse), + }) + } catch (e) { + // if we can't process a console log, we don't want to lose the whole shebang + captureException(e, { extra: { messagePayload: event.data.payload?.payload }, tags: { session_id } }) + } } }) diff --git a/plugin-server/tests/main/process-event.test.ts b/plugin-server/tests/main/process-event.test.ts index 94587c6770773..d067b92fed75c 100644 --- a/plugin-server/tests/main/process-event.test.ts +++ b/plugin-server/tests/main/process-event.test.ts @@ -1484,20 +1484,46 @@ test(`snapshot event with no event summary timestamps is ignored`, () => { }).toThrowError() }) -test('simple console log processing', () => { - const consoleLogEntries = gatherConsoleLogEvents(team.id, 'session_id', [ - { - timestamp: 1682449093469, - type: 6, - data: { - plugin: 'rrweb/console@1', - payload: { - level: 'info', - payload: ['the message', 'more strings', '', null, false, 0, { blah: 'wat' }], - }, +function consoleMessageFor(payload: any[]) { + return { + timestamp: 1682449093469, + type: 6, + data: { + plugin: 'rrweb/console@1', + payload: { + level: 'info', + payload: payload, }, }, - null as unknown as RRWebEvent, // see https://posthog.sentry.io/issues/4525043303 + } +} + +test.each([ + { + payload: ['the message', 'more strings', '', null, false, 0, { blah: 'wat' }], + expectedMessage: 'the message more strings', + }, + { + // lone surrogate pairs are replaced with the "unknown" character + payload: ['\\\\\\",\\\\\\"emoji_flag\\\\\\":\\\\\\"\ud83c...[truncated]'], + expectedMessage: '\\\\\\",\\\\\\"emoji_flag\\\\\\":\\\\\\"\ufffd...[truncated]', + }, + { + // sometimes the strings are wrapped in quotes... + payload: ['"test"'], + expectedMessage: '"test"', + }, + { + // let's not accept arbitrary length content + payload: [new Array(3001).join('a')], + expectedMessage: new Array(3000).join('a'), + }, +])('simple console log processing', ({ payload, expectedMessage }) => { + const consoleLogEntries = gatherConsoleLogEvents(team.id, 'session_id', [ + consoleMessageFor(payload), + // see https://posthog.sentry.io/issues/4525043303 + // null events always ignored + null as unknown as RRWebEvent, ]) expect(consoleLogEntries).toEqual([ { @@ -1507,7 +1533,7 @@ test('simple console log processing', () => { log_source_id: 'session_id', instance_id: null, timestamp: castTimestampToClickhouseFormat(DateTime.fromMillis(1682449093469), TimestampFormat.ClickHouse), - message: 'the message more strings', + message: expectedMessage, } satisfies ConsoleLogEntry, ]) })