From ebc64bc7f861fcc936d8009f0a8a39421da3a8e6 Mon Sep 17 00:00:00 2001 From: Pete Gillin Date: Fri, 6 Dec 2024 18:58:31 +0000 Subject: [PATCH] Update `UpdateForV9` in `AttachmentProcessor` We are not going to make this change in V9. We may do it in V10. This change just bumps the annotation to remind us to revisit. Since we are living with this for a while, it seems worth improving the documentation. This now encourages explicitly setting the option one way or the other, since you get a warning if you omit it. It also changes the existing examples to use true rather than false, as that's our recommendation. And it adds a new section with an example where it's false, and moves the content previously in a note into that section. --- .../ingest/processors/attachment.asciidoc | 91 ++++++++++++++----- .../attachment/AttachmentProcessor.java | 10 +- 2 files changed, 73 insertions(+), 28 deletions(-) diff --git a/docs/reference/ingest/processors/attachment.asciidoc b/docs/reference/ingest/processors/attachment.asciidoc index fd2866906c1d0..bd5b8db562ae2 100644 --- a/docs/reference/ingest/processors/attachment.asciidoc +++ b/docs/reference/ingest/processors/attachment.asciidoc @@ -19,15 +19,15 @@ representation. The processor will skip the base64 decoding then. .Attachment options [options="header"] |====== -| Name | Required | Default | Description -| `field` | yes | - | The field to get the base64 encoded field from -| `target_field` | no | attachment | The field that will hold the attachment information -| `indexed_chars` | no | 100000 | The number of chars being used for extraction to prevent huge fields. Use `-1` for no limit. -| `indexed_chars_field` | no | `null` | Field name from which you can overwrite the number of chars being used for extraction. See `indexed_chars`. -| `properties` | no | all properties | Array of properties to select to be stored. 
Can be `content`, `title`, `name`, `author`, `keywords`, `date`, `content_type`, `content_length`, `language` -| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document -| `remove_binary` | no | `false` | If `true`, the binary `field` will be removed from the document -| `resource_name` | no | | Field containing the name of the resource to decode. If specified, the processor passes this resource name to the underlying Tika library to enable https://tika.apache.org/1.24.1/detection.html#Resource_Name_Based_Detection[Resource Name Based Detection]. +| Name | Required | Default | Description +| `field` | yes | - | The field to get the base64 encoded field from +| `target_field` | no | attachment | The field that will hold the attachment information +| `indexed_chars` | no | 100000 | The number of chars being used for extraction to prevent huge fields. Use `-1` for no limit. +| `indexed_chars_field` | no | `null` | Field name from which you can overwrite the number of chars being used for extraction. See `indexed_chars`. +| `properties` | no | all properties | Array of properties to select to be stored. Can be `content`, `title`, `name`, `author`, `keywords`, `date`, `content_type`, `content_length`, `language` +| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document +| `remove_binary` | encouraged | `false` | If `true`, the binary `field` will be removed from the document. This option is not required, but setting it explicitly is encouraged, and omitting it will result in a warning. +| `resource_name` | no | | Field containing the name of the resource to decode. If specified, the processor passes this resource name to the underlying Tika library to enable https://tika.apache.org/1.24.1/detection.html#Resource_Name_Based_Detection[Resource Name Based Detection]. 
|====== [discrete] @@ -58,7 +58,7 @@ PUT _ingest/pipeline/attachment { "attachment" : { "field" : "data", - "remove_binary": false + "remove_binary": true } } ] @@ -82,7 +82,6 @@ The document's `attachment` object contains extracted properties for the file: "_seq_no": 22, "_primary_term": 1, "_source": { - "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=", "attachment": { "content_type": "application/rtf", "language": "ro", @@ -94,9 +93,6 @@ The document's `attachment` object contains extracted properties for the file: ---- // TESTRESPONSE[s/"_seq_no": \d+/"_seq_no" : $body._seq_no/ s/"_primary_term" : 1/"_primary_term" : $body._primary_term/] -NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended - to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field. - [[attachment-fields]] ==== Exported fields @@ -143,7 +139,7 @@ PUT _ingest/pipeline/attachment "attachment" : { "field" : "data", "properties": [ "content", "title" ], - "remove_binary": false + "remove_binary": true } } ] @@ -154,6 +150,59 @@ NOTE: Extracting contents from binary data is a resource intensive operation and consumes a lot of resources. It is highly recommended to run pipelines using this processor in a dedicated ingest node. +[[attachment-keep-binary]] +==== Keeping the attachment binary + +Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended to remove +that field from the document, by setting `remove_binary` to `true` to automatically remove the field, as in the other +examples shown on this page. 
If you _do_ want to keep the binary field, explicitly set `remove_binary` to `false` to +avoid the warning you get from omitting it: + +[source,console] +---- +PUT _ingest/pipeline/attachment +{ + "description" : "Extract attachment information including original binary", + "processors" : [ + { + "attachment" : { + "field" : "data", + "remove_binary": false + } + } + ] +} +PUT my-index-000001/_doc/my_id?pipeline=attachment +{ + "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=" +} +GET my-index-000001/_doc/my_id +---- + +The document's `_source` object includes the original binary field: + +[source,console-result] +---- +{ + "found": true, + "_index": "my-index-000001", + "_id": "my_id", + "_version": 1, + "_seq_no": 22, + "_primary_term": 1, + "_source": { + "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=", + "attachment": { + "content_type": "application/rtf", + "language": "ro", + "content": "Lorem ipsum dolor sit amet", + "content_length": 28 + } + } +} +---- +// TESTRESPONSE[s/"_seq_no": \d+/"_seq_no" : $body._seq_no/ s/"_primary_term" : 1/"_primary_term" : $body._primary_term/] + [[attachment-cbor]] ==== Use the attachment processor with CBOR @@ -170,7 +219,7 @@ PUT _ingest/pipeline/cbor-attachment { "attachment" : { "field" : "data", - "remove_binary": false + "remove_binary": true } } ] @@ -226,7 +275,7 @@ PUT _ingest/pipeline/attachment "field" : "data", "indexed_chars" : 11, "indexed_chars_field" : "max_size", - "remove_binary": false + "remove_binary": true } } ] @@ -250,7 +299,6 @@ Returns this: "_seq_no": 35, "_primary_term": 1, "_source": { - "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=", "attachment": { "content_type": "application/rtf", "language": "is", @@ -274,7 +322,7 @@ PUT _ingest/pipeline/attachment "field" : "data", "indexed_chars" : 11, "indexed_chars_field" : "max_size", - "remove_binary": false + "remove_binary": true } } ] @@ -299,7 +347,6 @@ Returns this: 
"_seq_no": 40, "_primary_term": 1, "_source": { - "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=", "max_size": 5, "attachment": { "content_type": "application/rtf", @@ -358,7 +405,7 @@ PUT _ingest/pipeline/attachment "attachment": { "target_field": "_ingest._value.attachment", "field": "_ingest._value.data", - "remove_binary": false + "remove_binary": true } } } @@ -396,7 +443,6 @@ Returns this: "attachments" : [ { "filename" : "ipsum.txt", - "data" : "dGhpcyBpcwpqdXN0IHNvbWUgdGV4dAo=", "attachment" : { "content_type" : "text/plain; charset=ISO-8859-1", "language" : "en", @@ -406,7 +452,6 @@ Returns this: }, { "filename" : "test.txt", - "data" : "VGhpcyBpcyBhIHRlc3QK", "attachment" : { "content_type" : "text/plain; charset=ISO-8859-1", "language" : "en", diff --git a/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java index 007fe39d72e61..83a7bdf7e224a 100644 --- a/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java +++ b/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java @@ -18,7 +18,7 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.logging.DeprecationCategory; import org.elasticsearch.common.logging.DeprecationLogger; -import org.elasticsearch.core.UpdateForV9; +import org.elasticsearch.core.UpdateForV10; import org.elasticsearch.ingest.AbstractProcessor; import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; @@ -196,7 +196,7 @@ public IngestDocument execute(IngestDocument ingestDocument) { * @param property property to add * @param value value to add */ - private void addAdditionalField(Map additionalFields, Property property, String value) { + private void addAdditionalField(Map additionalFields, Property property, String 
value) { if (properties.contains(property) && Strings.hasLength(value)) { additionalFields.put(property.toLowerCase(), value); } @@ -233,7 +233,7 @@ public AttachmentProcessor create( String processorTag, String description, Map config - ) throws Exception { + ) { String field = readStringProperty(TYPE, processorTag, config, "field"); String resourceName = readOptionalStringProperty(TYPE, processorTag, config, "resource_name"); String targetField = readStringProperty(TYPE, processorTag, config, "target_field", "attachment"); @@ -241,8 +241,8 @@ public AttachmentProcessor create( int indexedChars = readIntProperty(TYPE, processorTag, config, "indexed_chars", NUMBER_OF_CHARS_INDEXED); boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); String indexedCharsField = readOptionalStringProperty(TYPE, processorTag, config, "indexed_chars_field"); - @UpdateForV9(owner = UpdateForV9.Owner.DATA_MANAGEMENT) - // update the [remove_binary] default to be 'true' assuming enough time has passed. Deprecated in September 2022. + @UpdateForV10(owner = UpdateForV10.Owner.DATA_MANAGEMENT) + // Revisit whether we want to update the [remove_binary] default to be 'true' - would need to find a way to do this safely Boolean removeBinary = readOptionalBooleanProperty(TYPE, processorTag, config, "remove_binary"); if (removeBinary == null) { DEPRECATION_LOGGER.warn(