Add find field structure and find messages structure APIs (#3346) (#3384)

Co-authored-by: Florian Bernd <[email protected]> (cherry picked from commit 4b93d7f) Co-authored-by: Lisa Cawley <[email protected]>
elastic · Dec 19, 2024 · ef9828c · ef9828c
1 parent a9c3d0e
commit ef9828c
Show file tree

Hide file tree

Showing 12 changed files with 2,148 additions and 263 deletions.
diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json
diff --git a/output/schema/schema.json b/output/schema/schema.json
diff --git a/output/schema/validation-errors.json b/output/schema/validation-errors.json
diff --git a/output/typescript/types.ts b/output/typescript/types.ts
diff --git a/specification/_doc_ids/table.csv b/specification/_doc_ids/table.csv
@@ -168,6 +168,8 @@ explain-dfanalytics,https://www.elastic.co/guide/en/elasticsearch/reference/{bra
 fail-processor,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/fail-processor.html
 field-and-document-access-control,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/field-and-document-access-control.html
 field-usage-stats,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/field-usage-stats.html
+find-field-structure,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/find-field-structure.html
+find-message-structure,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/find-message-structure.html
 find-structure,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/find-structure.html
 fingerprint-processor,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/fingerprint-processor.html
 foreach-processor,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/foreach-processor.html

diff --git a/...on/text_structure/find_structure/types.ts → ...cation/text_structure/_types/Structure.ts b/...on/text_structure/find_structure/types.ts → ...cation/text_structure/_types/Structure.ts
@@ -36,3 +36,15 @@ export class TopHit {
   count: long
   value: UserDefinedValue
 }
+
+export enum EcsCompatibilityType { // type of the `ecs_compatibility` request parameter / response field in find_field_structure
+  disabled, // documented server default for `ecs_compatibility`
+  v1 // NOTE(review): presumably selects ECS-compliant (v1) Grok patterns instead of legacy ones — confirm
+}
+
+export enum FormatType { // type of the `format` request parameter / response field: the high level structure of the text
+  delimited, // rows of delimiter-separated values; `column_names`, `delimiter`, `quote`, `should_trim_fields` apply
+  ndjson, // NOTE(review): presumably newline-delimited JSON — confirm
+  semi_structured_text, // text from which fields are extracted with a Grok pattern; `grok_pattern` applies
+  xml
+}
diff --git a/specification/text_structure/find_field_structure/FindFieldStructureRequest.ts b/specification/text_structure/find_field_structure/FindFieldStructureRequest.ts
@@ -0,0 +1,162 @@
+/*
+ * Licensed to Elasticsearch B.V. under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch B.V. licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { RequestBase } from '@_types/Base'
+import { Field, GrokPattern, IndexName } from '@_types/common'
+import { uint } from '@_types/Numeric'
+import { Duration } from '@_types/Time'
+import { EcsCompatibilityType, FormatType } from '../_types/Structure'
+
+/**
+ * Find the structure of a text field.
+ * Find the structure of a text field in an Elasticsearch index.
+ * @rest_spec_name text_structure.find_field_structure
+ * @availability stack stability=stable visibility=public
+ * @cluster_privileges monitor_text_structure
+ * @doc_id find-field-structure
+ */
+interface Request extends RequestBase {
+  query_parameters: {
+    /**
+     * If `format` is set to `delimited`, you can specify the column names in a comma-separated list.
+     * If this parameter is not specified, the structure finder uses the column names from the header row of the text.
+     * If the text does not have a header row, columns are named "column1", "column2", "column3", for example.
+     */
+    column_names?: string
+    /**
+     * If you have set `format` to `delimited`, you can specify the character used to delimit the values in each row.
+     * Only a single character is supported; the delimiter cannot have multiple characters.
+     * By default, the API considers the following possibilities: comma, tab, semi-colon, and pipe (`|`).
+     * In this default scenario, all rows must have the same number of fields for the delimited format to be detected.
+     * If you specify a delimiter, up to 10% of the rows can have a different number of columns than the first row.
+     */
+    delimiter?: string
+    /**
+     * The number of documents to include in the structural analysis.
+     * The minimum value is 2.
+     * @server_default 1000
+     */
+    documents_to_sample?: uint
+    /**
+     * The mode of compatibility with ECS compliant Grok patterns.
+     * Use this parameter to specify whether to use ECS Grok patterns instead of legacy ones when the structure finder creates a Grok pattern.
+     * This setting primarily has an impact when a whole message Grok pattern such as `%{CATALINALOG}` matches the input.
+     * If the structure finder identifies a common structure but has no idea of the meaning then generic field names such as `path`, `ipaddress`, `field1`, and `field2` are used in the `grok_pattern` output.
+     * The intention in that situation is that a user who knows the meanings will rename the fields before using them.
+     * @server_default disabled
+     */
+    ecs_compatibility?: EcsCompatibilityType
+    /**
+     * If true, the response includes a field named `explanation`, which is an array of strings that indicate how the structure finder produced its result.
+     * @server_default false
+     */
+    explain?: boolean
+    /**
+     * The field that should be analyzed.
+     */
+    field: Field
+    /**
+     * The high level structure of the text.
+     * By default, the API chooses the format.
+     * In this default scenario, all rows must have the same number of fields for a delimited format to be detected.
+     * If the format is set to delimited and the delimiter is not set, however, the API tolerates up to 5% of rows that have a different number of columns than the first row.
+     */
+    format?: FormatType
+    /**
+     * If the format is `semi_structured_text`, you can specify a Grok pattern that is used to extract fields from every message in the text.
+     * The name of the timestamp field in the Grok pattern must match what is specified in the `timestamp_field` parameter.
+     * If that parameter is not specified, the name of the timestamp field in the Grok pattern must match "timestamp".
+     * If `grok_pattern` is not specified, the structure finder creates a Grok pattern.
+     */
+    grok_pattern?: GrokPattern
+    /**
+     * The name of the index that contains the analyzed field.
+     */
+    index: IndexName
+    /**
+     * If the format is `delimited`, you can specify the character used to quote the values in each row if they contain newlines or the delimiter character.
+     * Only a single character is supported.
+     * If this parameter is not specified, the default value is a double quote (`"`).
+     * If your delimited text format does not use quoting, a workaround is to set this argument to a character that does not appear anywhere in the sample.
+     */
+    quote?: string
+    /**
+     * If the format is `delimited`, you can specify whether values between delimiters should have whitespace trimmed from them.
+     * If this parameter is not specified and the delimiter is pipe (`|`), the default value is true.
+     * Otherwise, the default value is false.
+     */
+    should_trim_fields?: boolean
+    /**
+     * The maximum amount of time that the structure analysis can take.
+     * If the analysis is still running when the timeout expires, it will be stopped.
+     * @server_default 25s
+     */
+    timeout?: Duration
+    /**
+     * The name of the field that contains the primary timestamp of each record in the text.
+     * In particular, if the text was ingested into an index, this is the field that would be used to populate the `@timestamp` field.
+     *
+     * If the format is `semi_structured_text`, this field must match the name of the appropriate extraction in the `grok_pattern`.
+     * Therefore, for semi-structured text, it is best not to specify this parameter unless `grok_pattern` is also specified.
+     *
+     * For structured text, if you specify this parameter, the field must exist within the text.
+     *
+     * If this parameter is not specified, the structure finder makes a decision about which field (if any) is the primary timestamp field.
+     * For structured text, it is not compulsory to have a timestamp in the text.
+     */
+    timestamp_field?: Field
+    /**
+     * The Java time format of the timestamp field in the text.
+     * Only a subset of Java time format letter groups are supported:
+     *
+     * * `a`
+     * * `d`
+     * * `dd`
+     * * `EEE`
+     * * `EEEE`
+     * * `H`
+     * * `HH`
+     * * `h`
+     * * `M`
+     * * `MM`
+     * * `MMM`
+     * * `MMMM`
+     * * `mm`
+     * * `ss`
+     * * `XX`
+     * * `XXX`
+     * * `yy`
+     * * `yyyy`
+     * * `zzz`
+     *
+     * Additionally `S` letter groups (fractional seconds) of length one to nine are supported providing they occur after `ss` and are separated from the `ss` by a period (`.`), comma (`,`), or colon (`:`).
+     * Spacing and punctuation is also permitted with the exception of a question mark (`?`), newline, and carriage return, together with literal text enclosed in single quotes.
+     * For example, `MM/dd HH.mm.ss,SSSSSS 'in' yyyy` is a valid override format.
+     *
+     * One valuable use case for this parameter is when the format is semi-structured text, there are multiple timestamp formats in the text, and you know which format corresponds to the primary timestamp, but you do not want to specify the full `grok_pattern`.
+     * Another is when the timestamp format is one that the structure finder does not consider by default.
+     *
+     * If this parameter is not specified, the structure finder chooses the best format from a built-in set.
+     *
+     * If the special value `null` is specified, the structure finder will not look for a primary timestamp in the text.
+     * When the format is semi-structured text, this will result in the structure finder treating the text as single-line messages.
+     */
+    timestamp_format?: string
+  }
+}
diff --git a/specification/text_structure/find_field_structure/FindFieldStructureResponse.ts b/specification/text_structure/find_field_structure/FindFieldStructureResponse.ts
@@ -0,0 +1,49 @@
+/*
+ * Licensed to Elasticsearch B.V. under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch B.V. licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { PipelineConfig } from '@ingest/_types/Pipeline'
+import { Dictionary } from '@spec_utils/Dictionary'
+import { Field, GrokPattern } from '@_types/common'
+import { TypeMapping } from '@_types/mapping/TypeMapping'
+import { integer } from '@_types/Numeric'
+import {
+  EcsCompatibilityType,
+  FieldStat,
+  FormatType
+} from '../_types/Structure'
+
+// Response body of text_structure.find_field_structure.
+// Field notes are limited to what the request parameter documentation establishes;
+// NOTE(review): the remaining fields are undocumented in this file — confirm their
+// meaning against the API reference before adding descriptions.
+export class Response {
+  body: {
+    charset: string
+    // Optional; mirrors the `ecs_compatibility` request parameter's type.
+    ecs_compatibility?: EcsCompatibilityType
+    /**
+     * An array of strings that indicate how the structure finder produced its result.
+     * Included when the `explain` query parameter is true.
+     */
+    explanation?: string[]
+    field_stats: Dictionary<Field, FieldStat>
+    format: FormatType
+    // Optional; the request docs refer to this as the `grok_pattern` output
+    // (user-supplied or created by the structure finder).
+    grok_pattern?: GrokPattern
+    java_timestamp_formats?: string[]
+    joda_timestamp_formats?: string[]
+    ingest_pipeline: PipelineConfig
+    mappings: TypeMapping
+    multiline_start_pattern?: string
+    need_client_timezone: boolean
+    num_lines_analyzed: integer
+    num_messages_analyzed: integer
+    sample_start: string
+    // Optional: the structure finder decides which field (if any) is the primary timestamp.
+    timestamp_field?: Field
+  }
+}