From 59262ac14ca20e7f27128958821071bec28ff2ff Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Fri, 16 Aug 2024 08:25:08 +0200 Subject: [PATCH] #703 Add maximum length for generated segment id fields. --- data/test13_expected/test13b_schema.json | 8 +++-- data/test14_expected/test14_schema.json | 8 +++-- data/test17_expected/test17b_schema.json | 12 ++++++-- data/test4_expected/test4_schema.json | 2 +- data/test5_expected/test5_schema.json | 2 +- data/test5_expected/test5a_schema.json | 2 +- data/test5_expected/test5b_schema.json | 2 +- data/test5_expected/test5c_schema.json | 2 +- data/test5_expected/test5d_schema.json | 8 +++-- .../cobol/parameters/MetadataFields.scala | 26 +++++++++++++++++ .../spark/cobol/schema/CobolSchema.scala | 25 +++++++++++++--- .../cobrix/spark/cobol/utils/SparkUtils.scala | 5 ++-- .../cobrix/spark/cobol/CobolSchemaSpec.scala | 29 +++++++++++++++---- 13 files changed, 105 insertions(+), 26 deletions(-) create mode 100644 spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/MetadataFields.scala diff --git a/data/test13_expected/test13b_schema.json b/data/test13_expected/test13b_schema.json index 91564a9a..3e3722b8 100644 --- a/data/test13_expected/test13b_schema.json +++ b/data/test13_expected/test13b_schema.json @@ -19,12 +19,16 @@ "name" : "Seg_Id0", "type" : "string", "nullable" : true, - "metadata" : { } + "metadata" : { + "maxLength" : 51 + } }, { "name" : "Seg_Id1", "type" : "string", "nullable" : true, - "metadata" : { } + "metadata" : { + "maxLength" : 51 + } }, { "name" : "SEGMENT_ID", "type" : "string", diff --git a/data/test14_expected/test14_schema.json b/data/test14_expected/test14_schema.json index 91564a9a..3e3722b8 100644 --- a/data/test14_expected/test14_schema.json +++ b/data/test14_expected/test14_schema.json @@ -19,12 +19,16 @@ "name" : "Seg_Id0", "type" : "string", "nullable" : true, - "metadata" : { } + "metadata" : { + "maxLength" : 51 + } }, { "name" : "Seg_Id1", "type" : "string", "nullable" : true, - "metadata" : { } + "metadata" : { + "maxLength" : 51 + } }, { "name" : "SEGMENT_ID", "type" : "string", diff --git a/data/test17_expected/test17b_schema.json b/data/test17_expected/test17b_schema.json index 61704acb..59e80e2c 100644 --- a/data/test17_expected/test17b_schema.json +++ b/data/test17_expected/test17b_schema.json @@ -19,17 +19,23 @@ "name" : "Seg_Id0", "type" : "string", "nullable" : true, - "metadata" : { } + "metadata" : { + "maxLength" : 51 + } }, { "name" : "Seg_Id1", "type" : "string", "nullable" : true, - "metadata" : { } + "metadata" : { + "maxLength" : 51 + } }, { "name" : "Seg_Id2", "type" : "string", "nullable" : true, - "metadata" : { } + "metadata" : { + "maxLength" : 51 + } }, { "name" : "SEGMENT_ID", "type" : "integer", diff --git a/data/test4_expected/test4_schema.json b/data/test4_expected/test4_schema.json index f70368f8..10a702b1 100644 --- a/data/test4_expected/test4_schema.json +++ b/data/test4_expected/test4_schema.json @@ -1 +1 @@ -{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{}},{"name":"Seg_Id1","type":"string","nullable":true,"metadata":{}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file +{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{"maxLength":51}},{"name":"Seg_Id1","type":"string","nullable":true,"metadata":{"maxLength":51}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file diff --git a/data/test5_expected/test5_schema.json b/data/test5_expected/test5_schema.json index f70368f8..10a702b1 100644 --- a/data/test5_expected/test5_schema.json +++ b/data/test5_expected/test5_schema.json @@ -1 +1 @@ -{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{}},{"name":"Seg_Id1","type":"string","nullable":true,"metadata":{}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file +{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{"maxLength":51}},{"name":"Seg_Id1","type":"string","nullable":true,"metadata":{"maxLength":51}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file diff --git a/data/test5_expected/test5a_schema.json b/data/test5_expected/test5a_schema.json index 39a25186..18859fff 100644 --- a/data/test5_expected/test5a_schema.json +++ b/data/test5_expected/test5a_schema.json @@ -1 +1 @@ -{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file +{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{"maxLength":51}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file diff --git a/data/test5_expected/test5b_schema.json b/data/test5_expected/test5b_schema.json index f70368f8..10a702b1 100644 --- a/data/test5_expected/test5b_schema.json +++ b/data/test5_expected/test5b_schema.json @@ -1 +1 @@ -{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{}},{"name":"Seg_Id1","type":"string","nullable":true,"metadata":{}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file +{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{"maxLength":51}},{"name":"Seg_Id1","type":"string","nullable":true,"metadata":{"maxLength":51}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file diff --git a/data/test5_expected/test5c_schema.json b/data/test5_expected/test5c_schema.json index 39a25186..18859fff 100644 --- a/data/test5_expected/test5c_schema.json +++ b/data/test5_expected/test5c_schema.json @@ -1 +1 @@ -{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file +{"type":"struct","fields":[{"name":"File_Id","type":"integer","nullable":false,"metadata":{}},{"name":"Record_Id","type":"long","nullable":false,"metadata":{}},{"name":"Record_Byte_Length","type":"integer","nullable":false,"metadata":{}},{"name":"Seg_Id0","type":"string","nullable":true,"metadata":{"maxLength":51}},{"name":"SEGMENT_ID","type":"string","nullable":true,"metadata":{"maxLength":5}},{"name":"COMPANY_ID","type":"string","nullable":true,"metadata":{"maxLength":10}},{"name":"STATIC_DETAILS","type":{"type":"struct","fields":[{"name":"COMPANY_NAME","type":"string","nullable":true,"metadata":{"maxLength":15}},{"name":"ADDRESS","type":"string","nullable":true,"metadata":{"maxLength":25}},{"name":"TAXPAYER","type":{"type":"struct","fields":[{"name":"TAXPAYER_TYPE","type":"string","nullable":true,"metadata":{"maxLength":1}},{"name":"TAXPAYER_STR","type":"string","nullable":true,"metadata":{"maxLength":8}},{"name":"TAXPAYER_NUM","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"CONTACTS","type":{"type":"struct","fields":[{"name":"PHONE_NUMBER","type":"string","nullable":true,"metadata":{"maxLength":17}},{"name":"CONTACT_PERSON","type":"string","nullable":true,"metadata":{"maxLength":28}}]},"nullable":true,"metadata":{}}]} \ No newline at end of file diff --git a/data/test5_expected/test5d_schema.json b/data/test5_expected/test5d_schema.json index 5d55074c..69507a0b 100644 --- a/data/test5_expected/test5d_schema.json +++ b/data/test5_expected/test5d_schema.json @@ -19,12 +19,16 @@ "name" : "Seg_Id0", "type" : "string", "nullable" : true, - "metadata" : { } + "metadata" : { + "maxLength" : 51 + } }, { "name" : "Seg_Id1", "type" : "string", "nullable" : true, - "metadata" : { } + "metadata" : { + "maxLength" : 51 + } }, { "name" : "RECORD_LENGTH", "type" : "integer", diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/MetadataFields.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/MetadataFields.scala new file mode 100644 index 00000000..f8928e32 --- /dev/null +++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/MetadataFields.scala @@ -0,0 +1,26 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.cobrix.spark.cobol.parameters + +object MetadataFields { + // Metadata for 'string' + val MAX_LENGTH = "maxLength" + + // Metadata for 'array' + val MIN_ELEMENTS = "minElements" + val MAX_ELEMENTS = "maxElements" +} diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala index 490c185b..8af8daa2 100644 --- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala +++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala @@ -28,6 +28,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaRetentionPolicy import za.co.absa.cobrix.cobol.reader.schema.{CobolSchema => CobolReaderSchema} import za.co.absa.cobrix.spark.cobol.parameters.CobolParametersParser.getReaderProperties +import za.co.absa.cobrix.spark.cobol.parameters.MetadataFields.{MAX_ELEMENTS, MAX_LENGTH, MIN_ELEMENTS} import za.co.absa.cobrix.spark.cobol.parameters.{CobolParametersParser, Parameters} import scala.collection.mutable @@ -101,7 +102,14 @@ class CobolSchema(copybook: Copybook, val recordsWithSegmentFields = if (generateSegIdFieldsCnt > 0) { val newFields = for (level <- Range(0, generateSegIdFieldsCnt)) - yield StructField(s"${Constants.segmentIdField}$level", StringType, nullable = true) + yield { + val maxPrefixLength = getMaximumSegmentIdLength(segmentIdProvidedPrefix) + val segFieldMetadata = new MetadataBuilder() + segFieldMetadata.putLong(MAX_LENGTH, maxPrefixLength.toLong) + + StructField(s"${Constants.segmentIdField}$level", StringType, nullable = true, metadata = segFieldMetadata.build()) + } + newFields.toArray ++ expandRecords } else { expandRecords @@ -130,6 +138,15 @@ class CobolSchema(copybook: Copybook, StructType(recordsWithRecordId) } + private [cobrix] def getMaximumSegmentIdLength(segmentIdProvidedPrefix: String): Int = { + val DATETIME_PREFIX_LENGTH = 15 + val SEGMENT_ID_MAX_GENERATED_LENGTH = 50 + + val prefixLength = if (segmentIdProvidedPrefix.isEmpty) DATETIME_PREFIX_LENGTH else segmentIdProvidedPrefix.length + + prefixLength + SEGMENT_ID_MAX_GENERATED_LENGTH + } + @throws(classOf[IllegalStateException]) private def parseGroup(group: Group, segmentRedefines: List[Group]): StructField = { val fields = group.children.flatMap(field => { @@ -210,12 +227,12 @@ class CobolSchema(copybook: Copybook, } private def addArrayMetadata(metadataBuilder: MetadataBuilder, st: Statement): MetadataBuilder = { - metadataBuilder.putLong("minElements", st.arrayMinSize) - metadataBuilder.putLong("maxElements", st.arrayMaxSize) + metadataBuilder.putLong(MIN_ELEMENTS, st.arrayMinSize) + metadataBuilder.putLong(MAX_ELEMENTS, st.arrayMaxSize) } private def addAlphaNumericMetadata(metadataBuilder: MetadataBuilder, a: AlphaNumeric): MetadataBuilder = { - metadataBuilder.putLong("maxLength", a.length) + metadataBuilder.putLong(MAX_LENGTH, a.length) } private def addExtendedMetadata(metadataBuilder: MetadataBuilder, s: Statement): MetadataBuilder = { diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/utils/SparkUtils.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/utils/SparkUtils.scala index 7a58d42b..2d501db7 100644 --- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/utils/SparkUtils.scala +++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/utils/SparkUtils.scala @@ -24,6 +24,7 @@ import za.co.absa.cobrix.spark.cobol.utils.impl.HofsWrapper.transform import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, DataFrame, SparkSession} import za.co.absa.cobrix.cobol.internal.Logging +import za.co.absa.cobrix.spark.cobol.parameters.MetadataFields.MAX_ELEMENTS import scala.annotation.tailrec import scala.collection.mutable @@ -129,8 +130,8 @@ object SparkUtils extends Logging { def getMaxArraySize(path: String): Int = { getField(path, df.schema) match { - case Some(field) if field.metadata.contains("maxElements") => - field.metadata.getLong("maxElements").toInt + case Some(field) if field.metadata.contains(MAX_ELEMENTS) => + field.metadata.getLong(MAX_ELEMENTS).toInt case _ => val collected = df.agg(max(expr(s"size($path)"))).collect()(0)(0) if (collected != null) { diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala index f047614a..8d3c2034 100644 --- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala @@ -21,6 +21,7 @@ import org.scalatest.wordspec.AnyWordSpec import org.slf4j.{Logger, LoggerFactory} import za.co.absa.cobrix.cobol.parser.CopybookParser import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy +import za.co.absa.cobrix.spark.cobol.parameters.MetadataFields.MAX_LENGTH import za.co.absa.cobrix.spark.cobol.schema.CobolSchema import za.co.absa.cobrix.spark.cobol.source.base.SimpleComparisonBase @@ -436,14 +437,14 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { val metadataStr2 = actualSparkSchema.fields(1).metadata val metadataNum3 = actualSparkSchema.fields(2).metadata - assert(metadataStr1.contains("maxLength")) - assert(metadataStr2.contains("maxLength")) - assert(!metadataNum3.contains("maxLength")) + assert(metadataStr1.contains(MAX_LENGTH)) + assert(metadataStr2.contains(MAX_LENGTH)) + assert(!metadataNum3.contains(MAX_LENGTH)) - actualSparkSchema.fields(1).metadata.getLong("maxLength") + actualSparkSchema.fields(1).metadata.getLong(MAX_LENGTH) - assert(metadataStr1.getLong("maxLength") == 10) - assert(metadataStr2.getLong("maxLength") == 7) + assert(metadataStr1.getLong(MAX_LENGTH) == 10) + assert(metadataStr2.getLong(MAX_LENGTH) == 7) } "fromSparkOptions" should { @@ -646,4 +647,20 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { } } + "getMaximumSegmentIdLength" should { + val copybook: String = + """ 01 RECORD. + | 05 STR1 PIC X(10). + |""".stripMargin + + val cobolSchema = CobolSchema.fromSparkOptions(Seq(copybook), Map.empty) + + "return proper size for autogenerated prefix" in { + assert(cobolSchema.getMaximumSegmentIdLength("") == 65) + } + + "return proper size for provided prefix" in { + assert(cobolSchema.getMaximumSegmentIdLength("ID_") == 53) + } + } }