diff --git a/docs/reference/ingest/processors/attachment.asciidoc b/docs/reference/ingest/processors/attachment.asciidoc index fd2866906c1d0..bd5b8db562ae2 100644 --- a/docs/reference/ingest/processors/attachment.asciidoc +++ b/docs/reference/ingest/processors/attachment.asciidoc @@ -19,15 +19,15 @@ representation. The processor will skip the base64 decoding then. .Attachment options [options="header"] |====== -| Name | Required | Default | Description -| `field` | yes | - | The field to get the base64 encoded field from -| `target_field` | no | attachment | The field that will hold the attachment information -| `indexed_chars` | no | 100000 | The number of chars being used for extraction to prevent huge fields. Use `-1` for no limit. -| `indexed_chars_field` | no | `null` | Field name from which you can overwrite the number of chars being used for extraction. See `indexed_chars`. -| `properties` | no | all properties | Array of properties to select to be stored. Can be `content`, `title`, `name`, `author`, `keywords`, `date`, `content_type`, `content_length`, `language` -| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document -| `remove_binary` | no | `false` | If `true`, the binary `field` will be removed from the document -| `resource_name` | no | | Field containing the name of the resource to decode. If specified, the processor passes this resource name to the underlying Tika library to enable https://tika.apache.org/1.24.1/detection.html#Resource_Name_Based_Detection[Resource Name Based Detection]. +| Name | Required | Default | Description +| `field` | yes | - | The field to get the base64 encoded field from +| `target_field` | no | attachment | The field that will hold the attachment information +| `indexed_chars` | no | 100000 | The number of chars being used for extraction to prevent huge fields. Use `-1` for no limit. +| `indexed_chars_field` | no | `null` | Field name from which you can overwrite the number of chars being used for extraction. See `indexed_chars`. +| `properties` | no | all properties | Array of properties to select to be stored. Can be `content`, `title`, `name`, `author`, `keywords`, `date`, `content_type`, `content_length`, `language` +| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document +| `remove_binary` | encouraged | `false` | If `true`, the binary `field` will be removed from the document. This option is not required, but setting it explicitly is encouraged, and omitting it will result in a warning. +| `resource_name` | no | | Field containing the name of the resource to decode. If specified, the processor passes this resource name to the underlying Tika library to enable https://tika.apache.org/1.24.1/detection.html#Resource_Name_Based_Detection[Resource Name Based Detection]. |====== [discrete] @@ -58,7 +58,7 @@ PUT _ingest/pipeline/attachment { "attachment" : { "field" : "data", - "remove_binary": false + "remove_binary": true } } ] @@ -82,7 +82,6 @@ The document's `attachment` object contains extracted properties for the file: "_seq_no": 22, "_primary_term": 1, "_source": { - "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=", "attachment": { "content_type": "application/rtf", "language": "ro", @@ -94,9 +93,6 @@ The document's `attachment` object contains extracted properties for the file: ---- // TESTRESPONSE[s/"_seq_no": \d+/"_seq_no" : $body._seq_no/ s/"_primary_term" : 1/"_primary_term" : $body._primary_term/] -NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended - to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field. - [[attachment-fields]] ==== Exported fields @@ -143,7 +139,7 @@ PUT _ingest/pipeline/attachment "attachment" : { "field" : "data", "properties": [ "content", "title" ], - "remove_binary": false + "remove_binary": true } } ] @@ -154,6 +150,59 @@ NOTE: Extracting contents from binary data is a resource intensive operation and consumes a lot of resources. It is highly recommended to run pipelines using this processor in a dedicated ingest node. +[[attachment-keep-binary]] +==== Keeping the attachment binary + +Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended to remove +that field from the document, by setting `remove_binary` to `true` to automatically remove the field, as in the other +examples shown on this page. If you _do_ want to keep the binary field, explicitly set `remove_binary` to `false` to +avoid the warning you get from omitting it: + +[source,console] +---- +PUT _ingest/pipeline/attachment +{ + "description" : "Extract attachment information including original binary", + "processors" : [ + { + "attachment" : { + "field" : "data", + "remove_binary": false + } + } + ] +} +PUT my-index-000001/_doc/my_id?pipeline=attachment +{ + "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=" +} +GET my-index-000001/_doc/my_id +---- + +The document's `_source` object includes the original binary field: + +[source,console-result] +---- +{ + "found": true, + "_index": "my-index-000001", + "_id": "my_id", + "_version": 1, + "_seq_no": 22, + "_primary_term": 1, + "_source": { + "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=", + "attachment": { + "content_type": "application/rtf", + "language": "ro", + "content": "Lorem ipsum dolor sit amet", + "content_length": 28 + } + } +} +---- +// TESTRESPONSE[s/"_seq_no": \d+/"_seq_no" : $body._seq_no/ s/"_primary_term" : 1/"_primary_term" : $body._primary_term/] + [[attachment-cbor]] ==== Use the attachment processor with CBOR @@ -170,7 +219,7 @@ PUT _ingest/pipeline/cbor-attachment { "attachment" : { "field" : "data", - "remove_binary": false + "remove_binary": true } } ] @@ -226,7 +275,7 @@ PUT _ingest/pipeline/attachment "field" : "data", "indexed_chars" : 11, "indexed_chars_field" : "max_size", - "remove_binary": false + "remove_binary": true } } ] @@ -250,7 +299,6 @@ Returns this: "_seq_no": 35, "_primary_term": 1, "_source": { - "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=", "attachment": { "content_type": "application/rtf", "language": "is", @@ -274,7 +322,7 @@ PUT _ingest/pipeline/attachment "field" : "data", "indexed_chars" : 11, "indexed_chars_field" : "max_size", - "remove_binary": false + "remove_binary": true } } ] @@ -299,7 +347,6 @@ Returns this: "_seq_no": 40, "_primary_term": 1, "_source": { - "data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=", "max_size": 5, "attachment": { "content_type": "application/rtf", @@ -358,7 +405,7 @@ PUT _ingest/pipeline/attachment "attachment": { "target_field": "_ingest._value.attachment", "field": "_ingest._value.data", - "remove_binary": false + "remove_binary": true } } } @@ -396,7 +443,6 @@ Returns this: "attachments" : [ { "filename" : "ipsum.txt", - "data" : "dGhpcyBpcwpqdXN0IHNvbWUgdGV4dAo=", "attachment" : { "content_type" : "text/plain; charset=ISO-8859-1", "language" : "en", @@ -406,7 +452,6 @@ Returns this: }, { "filename" : "test.txt", - "data" : "VGhpcyBpcyBhIHRlc3QK", "attachment" : { "content_type" : "text/plain; charset=ISO-8859-1", "language" : "en", diff --git a/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java index 007fe39d72e61..83a7bdf7e224a 100644 --- a/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java +++ b/modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java @@ -18,7 +18,7 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.logging.DeprecationCategory; import org.elasticsearch.common.logging.DeprecationLogger; -import org.elasticsearch.core.UpdateForV9; +import org.elasticsearch.core.UpdateForV10; import org.elasticsearch.ingest.AbstractProcessor; import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; @@ -196,7 +196,7 @@ public IngestDocument execute(IngestDocument ingestDocument) { * @param property property to add * @param value value to add */ - private <T> void addAdditionalField(Map<String, Object> additionalFields, Property property, String value) { + private void addAdditionalField(Map<String, Object> additionalFields, Property property, String value) { if (properties.contains(property) && Strings.hasLength(value)) { additionalFields.put(property.toLowerCase(), value); } @@ -233,7 +233,7 @@ public AttachmentProcessor create( String processorTag, String description, Map<String, Object> config - ) throws Exception { + ) { String field = readStringProperty(TYPE, processorTag, config, "field"); String resourceName = readOptionalStringProperty(TYPE, processorTag, config, "resource_name"); String targetField = readStringProperty(TYPE, processorTag, config, "target_field", "attachment"); @@ -241,8 +241,8 @@ public AttachmentProcessor create( int indexedChars = readIntProperty(TYPE, processorTag, config, "indexed_chars", NUMBER_OF_CHARS_INDEXED); boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); String indexedCharsField = readOptionalStringProperty(TYPE, processorTag, config, "indexed_chars_field"); - @UpdateForV9(owner = UpdateForV9.Owner.DATA_MANAGEMENT) - // update the [remove_binary] default to be 'true' assuming enough time has passed. Deprecated in September 2022. + @UpdateForV10(owner = UpdateForV10.Owner.DATA_MANAGEMENT) + // Revisit whether we want to update the [remove_binary] default to be 'true' - would need to find a way to do this safely Boolean removeBinary = readOptionalBooleanProperty(TYPE, processorTag, config, "remove_binary"); if (removeBinary == null) { DEPRECATION_LOGGER.warn(