From 25173c4fd9952d61c9a7679668b10d0a17a1773f Mon Sep 17 00:00:00 2001 From: Andrew Patton Date: Fri, 10 May 2024 10:03:31 -0700 Subject: [PATCH] Add sanitization to strip LLM control tokens --- packages/parsing/src/parse-as-json.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/parsing/src/parse-as-json.ts b/packages/parsing/src/parse-as-json.ts index f10a9271..4ec1635d 100644 --- a/packages/parsing/src/parse-as-json.ts +++ b/packages/parsing/src/parse-as-json.ts @@ -183,13 +183,16 @@ const getObjectKeyFromIndex = (index: number) => const OBJECT_KEY_REGEXP = /^\s*"[^"]+":/; +const CONTROL_TOKENS_REGEXP = /(^<\|im_start\|>|<\|im_end\|>$)/; + type ParsedValue = string | boolean | number | GenericObject | Array; export function parseAsJSON(text: string): ParsedValue | null { // if the input is undefined/null, return null to indicate failure if (text == null) return null; - // attempt to parse the string as-is + text = text.replace(CONTROL_TOKENS_REGEXP, ''); + // attempt to parse the string as-is (minus control tokens) try { return JSON.parse(text) as ParsedValue; } catch (error) {