From 7acd0fca74433e0f90619bc9c9830a03d9363d88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Vil=C3=A1?=
Date: Wed, 5 Jun 2024 09:31:58 -0500
Subject: [PATCH] Add text embedding processor (#304)

* Add text embedding processor

Signed-off-by: miguel-vila
---
Reviewer notes (this region, between the three-dash line and the first
"diff --git" header, is not part of the commit):
  * spec/schemas/ingest._common.yaml
      - hunk 1 removes the `required: [_meta]` constraint from the schema
        defined immediately before ProcessorContainer;
      - hunk 2 adds a `text_embedding` entry to the processor property list
        that ends just before AttachmentProcessor (apparently
        ProcessorContainer -- confirm against the target tree);
      - hunk 3 appends the TextEmbeddingProcessor schema, which composes
        ProcessorBase and requires `model_id` and `field_map`.
  * tests/text_embedding_processor.yaml (new file)
      - PUTs `books_pipeline` with one text_embedding processor, GETs it
        back, and DELETEs it in the epilogue (200 or 404 accepted).
  * NOTE(review): line breaks and indentation were reconstructed from a
    whitespace-collapsed copy of this patch; verify the context lines
    against the target tree before applying.

 spec/schemas/ingest._common.yaml    | 23 +++++++++++++++++--
 tests/text_embedding_processor.yaml | 35 +++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 tests/text_embedding_processor.yaml

diff --git a/spec/schemas/ingest._common.yaml b/spec/schemas/ingest._common.yaml
index 944c846a5..b118be853 100644
--- a/spec/schemas/ingest._common.yaml
+++ b/spec/schemas/ingest._common.yaml
@@ -28,8 +28,6 @@ components:
           $ref: '_common.yaml#/components/schemas/VersionNumber'
         _meta:
           $ref: '_common.yaml#/components/schemas/Metadata'
-      required:
-        - _meta
     ProcessorContainer:
       type: object
       properties:
@@ -101,6 +99,8 @@ components:
           $ref: '#/components/schemas/CircleProcessor'
         inference:
           $ref: '#/components/schemas/InferenceProcessor'
+        text_embedding:
+          $ref: '#/components/schemas/TextEmbeddingProcessor'
       minProperties: 1
       maxProperties: 1
     AttachmentProcessor:
@@ -870,3 +870,22 @@ components:
             Specifies the type of the predicted field to write.
             Valid values are: `string`, `number`, `boolean`.
           type: string
+    TextEmbeddingProcessor:
+      allOf:
+        - $ref: '#/components/schemas/ProcessorBase'
+        - type: object
+          properties:
+            model_id:
+              $ref: '_common.yaml#/components/schemas/Id'
+            field_map:
+              description: |-
+                Contains key-value pairs that specify the mapping of a text field to a vector field.
+              type: object
+              additionalProperties:
+                type: string
+            description:
+              type: string
+              description: A brief description of the processor.
+          required:
+            - model_id
+            - field_map
diff --git a/tests/text_embedding_processor.yaml b/tests/text_embedding_processor.yaml
new file mode 100644
index 000000000..ac1a98550
--- /dev/null
+++ b/tests/text_embedding_processor.yaml
@@ -0,0 +1,35 @@
+$schema: ../json_schemas/test_story.schema.yaml
+
+skip: false
+description: |
+  This test story checks that we can create an ingest pipeline with a text
+  embedding processor
+epilogues:
+  - path: /_ingest/pipeline/books_pipeline
+    method: DELETE
+    status: [200, 404]
+chapters:
+  - synopsis: Create ingest pipeline for text embedding
+    path: /_ingest/pipeline/{id}
+    method: PUT
+    parameters:
+      id: books_pipeline
+    request_body:
+      payload:
+        description: "Extracts text from field and embeds it"
+        processors:
+          - text_embedding:
+              model_id: "text-embedding-model"
+              field_map:
+                text: "passage_embedding"
+    response:
+      status: 200
+      payload:
+        acknowledged: true
+  - synopsis: Query created pipeline
+    path: /_ingest/pipeline/{id}
+    method: GET
+    parameters:
+      id: books_pipeline
+    response:
+      status: 200