From 60d270848cb151dc792fdd7bde43e4041c76db5e Mon Sep 17 00:00:00 2001
From: Ruslan Iushchenko
Date: Tue, 24 Feb 2026 08:46:40 +0100
Subject: [PATCH 1/3] #822 Allow '_corrupt_records' to extract data in HEX
instead of binary data type.
---
.../extractors/record/RecordExtractors.scala | 65 +++++++++-------
.../iterator/FixedLenNestedRowIterator.scala | 7 +-
.../iterator/VarLenNestedIterator.scala | 7 +-
.../parameters/CobolParametersParser.scala | 12 ++-
.../parameters/CorruptFieldsPolicy.scala | 25 +++++++
.../reader/parameters/ReaderParameters.scala | 4 +-
.../cobol/reader/schema/CobolSchema.scala | 8 +-
.../builder/SparkCobolOptionsBuilder.scala | 7 +-
.../spark/cobol/schema/CobolSchema.scala | 31 ++++++--
.../cobrix/spark/cobol/CobolSchemaSpec.scala | 23 +++++-
.../source/base/impl/DummyCobolSchema.scala | 3 +-
.../integration/Test41CorruptFieldsSpec.scala | 75 ++++++++++++++++++-
12 files changed, 215 insertions(+), 52 deletions(-)
create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CorruptFieldsPolicy.scala
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
index 838c0e36..c23ae957 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
@@ -35,21 +35,22 @@ object RecordExtractors {
/**
* This method extracts a record from the specified array of bytes. The copybook for the record needs to be already parsed.
*
- * @param ast The parsed copybook.
- * @param data The data bits containing the record.
- * @param offsetBytes The offset to the beginning of the record (in bits).
- * @param policy A schema retention policy to be applied to the extracted record.
- * @param variableLengthOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements.
- * @param generateRecordId If true, a record id field will be added as the first field of the record.
- * @param generateRecordBytes If true, a record bytes field will be added at the beginning of each record.
- * @param generateCorruptFields If true, a corrupt fields field will be appended to the end of the schema.
- * @param segmentLevelIds Segment ids to put to the extracted record if id generation it turned on.
- * @param fileId A file id to be put to the extractor record if generateRecordId == true.
- * @param recordId The record id to be saved to the record id field.
- * @param activeSegmentRedefine An active segment redefine (the one that will be parsed).
- * All other segment redefines will be skipped.
- * @param generateInputFileField if true, a field containing input file name will be generated
- * @param inputFileName An input file name to put if its generation is needed
+ * @param ast The parsed copybook.
+ * @param data The data bits containing the record.
+ * @param offsetBytes The offset to the beginning of the record (in bits).
+ * @param policy A schema retention policy to be applied to the extracted record.
+ * @param variableLengthOccurs If true, OCCURS DEPENDING ON data size will depend on the number of elements.
+ * @param generateRecordId If true, a record id field will be added as the first field of the record.
+ * @param generateRecordBytes If true, a record bytes field will be added at the beginning of each record.
+ * @param generateCorruptFields If true, a corrupt fields field will be appended to the end of the schema.
+ * @param generateCorruptFieldsAsHex If true, corrupt fields will be generated as hex strings, otherwise they will be generated as binary data. This parameter is only relevant if generateCorruptFields is true.
+ * @param segmentLevelIds            Segment ids to put to the extracted record if id generation is turned on.
+ * @param fileId A file id to be put to the extractor record if generateRecordId == true.
+ * @param recordId The record id to be saved to the record id field.
+ * @param activeSegmentRedefine An active segment redefine (the one that will be parsed).
+ * All other segment redefines will be skipped.
+ * @param generateInputFileField if true, a field containing input file name will be generated
+ * @param inputFileName An input file name to put if its generation is needed
* @return An Array[Any] object corresponding to the record schema.
*/
@throws(classOf[IllegalStateException])
@@ -62,6 +63,7 @@ object RecordExtractors {
generateRecordId: Boolean = false,
generateRecordBytes: Boolean = false,
generateCorruptFields: Boolean = false,
+ generateCorruptFieldsAsHex: Boolean = false,
segmentLevelIds: List[String] = Nil,
fileId: Int = 0,
recordId: Long = 0,
@@ -213,7 +215,7 @@ object RecordExtractors {
policy
}
- applyRecordPostProcessing(ast, records.toList, effectiveSchemaRetentionPolicy, generateRecordId, generateRecordBytes, generateCorruptFields, segmentLevelIds, fileId, recordId, data.length, data, generateInputFileField, inputFileName, corruptFields, handler)
+ applyRecordPostProcessing(ast, records.toList, effectiveSchemaRetentionPolicy, generateRecordId, generateRecordBytes, generateCorruptFields, generateCorruptFieldsAsHex, segmentLevelIds, fileId, recordId, data.length, data, generateInputFileField, inputFileName, corruptFields, handler)
}
/**
@@ -433,7 +435,7 @@ object RecordExtractors {
policy
}
- applyRecordPostProcessing(ast, records.toList, effectiveSchemaRetentionPolicy, generateRecordId, generateRecordBytes = false, generateCorruptFields = false, Nil, fileId, recordId, recordLength, Array.empty[Byte], generateInputFileField = generateInputFileField, inputFileName, null, handler)
+ applyRecordPostProcessing(ast, records.toList, effectiveSchemaRetentionPolicy, generateRecordId, generateRecordBytes = false, generateCorruptFields = false, generateCorruptFieldsAsHex = false, Nil, fileId, recordId, recordLength, Array.empty[Byte], generateInputFileField = generateInputFileField, inputFileName, null, handler)
}
/**
@@ -449,16 +451,18 @@ object RecordExtractors {
* Combinations of the listed transformations are supported.
*
*
- * @param ast The parsed copybook
- * @param records The array of [[T]] object for each Group of the copybook
- * @param generateRecordId If true a record id field will be added as the first field of the record.
- * @param generateRecordBytes If true a record bytes field will be added at the beginning of the record.
- * @param generateCorruptFields If true,a corrupt fields field will be appended to the end of the schema.
- * @param fileId The file id to be saved to the file id field
- * @param recordId The record id to be saved to the record id field
- * @param recordByteLength The length of the record
- * @param generateInputFileField if true, a field containing input file name will be generated
- * @param inputFileName An input file name to put if its generation is needed
+ * @param ast The parsed copybook
+ * @param records The array of [[T]] object for each Group of the copybook
+ * @param generateRecordId If true a record id field will be added as the first field of the record.
+ * @param generateRecordBytes If true a record bytes field will be added at the beginning of the record.
+ * @param generateCorruptFields      If true, a corrupt fields field will be appended to the end of the schema.
+ * @param generateCorruptFieldsAsHex If true, corrupt fields will be generated as hex strings, otherwise they will be generated as binary data. This parameter is only relevant if generateCorruptFields is true.
+ * @param segmentLevelIds            Segment ids to put to the extracted record if id generation is turned on.
+ * @param fileId The file id to be saved to the file id field
+ * @param recordId The record id to be saved to the record id field
+ * @param recordByteLength The length of the record
+ * @param generateInputFileField if true, a field containing input file name will be generated
+ * @param inputFileName An input file name to put if its generation is needed
* @return A [[T]] object corresponding to the record schema
*/
private def applyRecordPostProcessing[T](
@@ -468,6 +472,7 @@ object RecordExtractors {
generateRecordId: Boolean,
generateRecordBytes: Boolean,
generateCorruptFields: Boolean,
+ generateCorruptFieldsAsHex: Boolean = false,
segmentLevelIds: List[String],
fileId: Int,
recordId: Long,
@@ -515,7 +520,11 @@ object RecordExtractors {
val ar = new Array[Any](len)
var i = 0
while (i < len) {
- val r = handler.create(Array[Any](corruptFields(i).fieldName, corruptFields(i).rawValue), corruptFieldsGroup)
+ val r = if (generateCorruptFieldsAsHex) {
+ handler.create(Array[Any](corruptFields(i).fieldName, corruptFields(i).rawValue.map("%02X" format _).mkString), corruptFieldsGroup)
+ } else {
+ handler.create(Array[Any](corruptFields(i).fieldName, corruptFields(i).rawValue), corruptFieldsGroup)
+ }
ar(i) = r
i += 1
}
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/FixedLenNestedRowIterator.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/FixedLenNestedRowIterator.scala
index a6a8fb8d..ec4cda4f 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/FixedLenNestedRowIterator.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/FixedLenNestedRowIterator.scala
@@ -18,7 +18,7 @@ package za.co.absa.cobrix.cobol.reader.iterator
import za.co.absa.cobrix.cobol.internal.Logging
import za.co.absa.cobrix.cobol.reader.extractors.record.{RecordExtractors, RecordHandler}
-import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
+import za.co.absa.cobrix.cobol.reader.parameters.{CorruptFieldsPolicy, ReaderParameters}
import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
import za.co.absa.cobrix.cobol.reader.validator.ReaderParametersValidator
@@ -47,6 +47,8 @@ class FixedLenNestedRowIterator[T: ClassTag](
private val segmentIdFilter = readerProperties.multisegment.flatMap(_.segmentIdFilter)
private val segmentRedefineMap = readerProperties.multisegment.map(_.segmentIdRedefineMap).getOrElse(HashMap[String, String]())
private val segmentRedefineAvailable = segmentRedefineMap.nonEmpty
+ private val generateCorruptFields = readerProperties.corruptFieldsPolicy != CorruptFieldsPolicy.Disabled
+ private val generateCorruptFieldsAsHex = readerProperties.corruptFieldsPolicy == CorruptFieldsPolicy.Hex
override def hasNext: Boolean = {
val correctOffset = if (singleRecordOnly) {
@@ -90,7 +92,8 @@ class FixedLenNestedRowIterator[T: ClassTag](
readerProperties.schemaPolicy,
readerProperties.variableSizeOccurs,
generateRecordBytes = readerProperties.generateRecordBytes,
- generateCorruptFields = readerProperties.generateCorruptFields,
+ generateCorruptFields = generateCorruptFields,
+ generateCorruptFieldsAsHex = generateCorruptFieldsAsHex,
activeSegmentRedefine = activeSegmentRedefine,
handler = handler
)
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/VarLenNestedIterator.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/VarLenNestedIterator.scala
index 0f9a231e..edc6459a 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/VarLenNestedIterator.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/VarLenNestedIterator.scala
@@ -20,7 +20,7 @@ import za.co.absa.cobrix.cobol.parser.Copybook
import za.co.absa.cobrix.cobol.parser.headerparsers.RecordHeaderParser
import za.co.absa.cobrix.cobol.reader.extractors.raw.RawRecordExtractor
import za.co.absa.cobrix.cobol.reader.extractors.record.{RecordExtractors, RecordHandler}
-import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
+import za.co.absa.cobrix.cobol.reader.parameters.{CorruptFieldsPolicy, ReaderParameters}
import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
import scala.collection.immutable.HashMap
@@ -60,6 +60,8 @@ final class VarLenNestedIterator[T: ClassTag](cobolSchema: Copybook,
private val segmentRedefineMap = readerProperties.multisegment.map(_.segmentIdRedefineMap).getOrElse(HashMap[String, String]())
private val segmentRedefineAvailable = segmentRedefineMap.nonEmpty
private val generateInputFileName = readerProperties.inputFileNameColumn.nonEmpty
+ private val generateCorruptFields = readerProperties.corruptFieldsPolicy != CorruptFieldsPolicy.Disabled
+ private val generateCorruptFieldsAsHex = readerProperties.corruptFieldsPolicy == CorruptFieldsPolicy.Hex
fetchNext()
@@ -99,7 +101,8 @@ final class VarLenNestedIterator[T: ClassTag](cobolSchema: Copybook,
readerProperties.variableSizeOccurs,
readerProperties.generateRecordId,
readerProperties.generateRecordBytes,
- readerProperties.generateCorruptFields,
+ generateCorruptFields,
+ generateCorruptFieldsAsHex,
segmentLevelIds,
fileId,
rawRecordIterator.getRecordIndex,
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala
index 6f9878ec..2f37a1a0 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala
@@ -400,6 +400,16 @@ object CobolParametersParser extends Logging {
else
None
+ val corruptFieldsPolicy = if (parameters.generateCorruptFields) {
+ if (parameters.decodeBinaryAsHex) {
+ CorruptFieldsPolicy.Hex
+ } else {
+ CorruptFieldsPolicy.Binary
+ }
+ } else {
+ CorruptFieldsPolicy.Disabled
+ }
+
ReaderParameters(
recordFormat = parameters.recordFormat,
isEbcdic = parameters.isEbcdic,
@@ -433,7 +443,7 @@ object CobolParametersParser extends Logging {
fileEndOffset = varLenParams.fileEndOffset,
generateRecordId = varLenParams.generateRecordId,
generateRecordBytes = parameters.generateRecordBytes,
- generateCorruptFields = parameters.generateCorruptFields,
+ corruptFieldsPolicy = corruptFieldsPolicy,
schemaPolicy = parameters.schemaRetentionPolicy,
stringTrimmingPolicy = parameters.stringTrimmingPolicy,
isDisplayAlwaysString = parameters.isDisplayAlwaysString,
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CorruptFieldsPolicy.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CorruptFieldsPolicy.scala
new file mode 100644
index 00000000..ea1c3c25
--- /dev/null
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CorruptFieldsPolicy.scala
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.cobol.reader.parameters
+
+sealed trait CorruptFieldsPolicy
+
+object CorruptFieldsPolicy {
+ case object Disabled extends CorruptFieldsPolicy
+ case object Binary extends CorruptFieldsPolicy
+ case object Hex extends CorruptFieldsPolicy
+}
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala
index 4a527e8a..9904fb45 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala
@@ -58,7 +58,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
* @param fileEndOffset A number of bytes to skip at the end of each file
* @param generateRecordId If true, a record id field will be prepended to each record.
* @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record
- * @param generateCorruptFields Generate '_corrupt_fields' field for fields that haven't converted successfully
+ * @param corruptFieldsPolicy        Specifies whether the '_corrupt_fields' field is generated for fields that haven't been converted successfully, and the type of its raw values.
* @param schemaPolicy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed.
* @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers.
@@ -112,7 +112,7 @@ case class ReaderParameters(
fileEndOffset: Int = 0,
generateRecordId: Boolean = false,
generateRecordBytes: Boolean = false,
- generateCorruptFields: Boolean = false,
+ corruptFieldsPolicy: CorruptFieldsPolicy = CorruptFieldsPolicy.Disabled,
schemaPolicy: SchemaRetentionPolicy = SchemaRetentionPolicy.CollapseRoot,
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
isDisplayAlwaysString: Boolean = false,
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
index 69a025c8..282e92e1 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
@@ -20,7 +20,7 @@ import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
import za.co.absa.cobrix.cobol.parser.encoding.{ASCII, EBCDIC}
import za.co.absa.cobrix.cobol.parser.policies.MetadataPolicy
import za.co.absa.cobrix.cobol.parser.{Copybook, CopybookParser}
-import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
+import za.co.absa.cobrix.cobol.reader.parameters.{CorruptFieldsPolicy, ReaderParameters}
import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaRetentionPolicy
import java.nio.charset.{Charset, StandardCharsets}
@@ -39,7 +39,7 @@ import scala.collection.immutable.HashMap
* @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param generateRecordId If true, a record id field will be prepended to the beginning of the schema.
* @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema.
- * @param generateCorruptFields If true, a corrupt fields field will be appended to the end of the schema.
+ * @param corruptSchemaPolicy Specifies a policy to handle corrupt records. By default, null values will be produced and the original value is ignored. If the policy is set, the '_corrupt_fields' field will be generated.
* @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema.
* @param generateSegIdFieldsCnt A number of segment ID levels to generate
* @param segmentIdProvidedPrefix A prefix for each segment id levels to make segment ids globally unique (by default the current timestamp will be used)
@@ -52,7 +52,7 @@ class CobolSchema(val copybook: Copybook,
val inputFileNameField: String,
val generateRecordId: Boolean,
val generateRecordBytes: Boolean,
- val generateCorruptFields: Boolean,
+ val corruptSchemaPolicy: CorruptFieldsPolicy,
val generateSegIdFieldsCnt: Int = 0,
val segmentIdProvidedPrefix: String = "",
val metadataPolicy: MetadataPolicy = MetadataPolicy.Basic) extends Serializable {
@@ -144,7 +144,7 @@ object CobolSchema {
readerParameters.inputFileNameColumn,
readerParameters.generateRecordId,
readerParameters.generateRecordBytes,
- readerParameters.generateCorruptFields,
+ readerParameters.corruptFieldsPolicy,
segIdFieldCount,
segmentIdPrefix,
readerParameters.metadataPolicy
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/builder/SparkCobolOptionsBuilder.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/builder/SparkCobolOptionsBuilder.scala
index d827e20b..e0450c23 100644
--- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/builder/SparkCobolOptionsBuilder.scala
+++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/builder/SparkCobolOptionsBuilder.scala
@@ -20,7 +20,7 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import za.co.absa.cobrix.cobol.reader.extractors.record.RecordExtractors
-import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
+import za.co.absa.cobrix.cobol.reader.parameters.{CorruptFieldsPolicy, ReaderParameters}
import za.co.absa.cobrix.cobol.reader.schema.{CobolSchema => CobolReaderSchema}
import za.co.absa.cobrix.spark.cobol.reader.RowHandler
import za.co.absa.cobrix.spark.cobol.schema.CobolSchema
@@ -62,6 +62,8 @@ class SparkCobolOptionsBuilder(copybookContent: String)(implicit spark: SparkSes
val minimumRecordLength = readerParams.minimumRecordLength
val maximumRecordLength = readerParams.maximumRecordLength
+ val generateCorruptFields = readerParams.corruptFieldsPolicy != CorruptFieldsPolicy.Disabled
+ val generateCorruptFieldsAsHex = readerParams.corruptFieldsPolicy == CorruptFieldsPolicy.Hex
val rddRow = rdd
.filter(array => array.nonEmpty && array.length >= minimumRecordLength && array.length <= maximumRecordLength)
@@ -73,7 +75,8 @@ class SparkCobolOptionsBuilder(copybookContent: String)(implicit spark: SparkSes
variableLengthOccurs = readerParams.variableSizeOccurs,
generateRecordId = readerParams.generateRecordId,
generateRecordBytes = readerParams.generateRecordBytes,
- generateCorruptFields = readerParams.generateCorruptFields,
+ generateCorruptFields = generateCorruptFields,
+ generateCorruptFieldsAsHex = generateCorruptFieldsAsHex,
handler = recordHandler)
Row.fromSeq(record)
})
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
index ed1b6d60..f4c26331 100644
--- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
+++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
@@ -25,7 +25,7 @@ import za.co.absa.cobrix.cobol.parser.common.Constants
import za.co.absa.cobrix.cobol.parser.encoding.RAW
import za.co.absa.cobrix.cobol.parser.policies.MetadataPolicy
import za.co.absa.cobrix.cobol.reader.parameters.CobolParametersParser.getReaderProperties
-import za.co.absa.cobrix.cobol.reader.parameters.{CobolParametersParser, Parameters}
+import za.co.absa.cobrix.cobol.reader.parameters.{CobolParametersParser, CorruptFieldsPolicy, Parameters}
import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy
import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaRetentionPolicy
import za.co.absa.cobrix.cobol.reader.schema.{CobolSchema => CobolReaderSchema}
@@ -44,7 +44,7 @@ import scala.collection.mutable.ArrayBuffer
* @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param generateRecordId If true, a record id field will be prepended to the beginning of the schema.
* @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema.
- * @param generateCorruptFields If true, a corrupt fields field will be appended to the end of the schema.
+ * @param corruptFieldsPolicy Specifies a policy to handle corrupt records. By default, null values will be produced and the original value is ignored. If the policy is set, the '_corrupt_fields' field will be generated.
* @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema.
* @param generateSegIdFieldsCnt A number of segment ID levels to generate
* @param segmentIdProvidedPrefix A prefix for each segment id levels to make segment ids globally unique (by default the current timestamp will be used)
@@ -57,7 +57,7 @@ class CobolSchema(copybook: Copybook,
inputFileNameField: String,
generateRecordId: Boolean,
generateRecordBytes: Boolean,
- generateCorruptFields: Boolean,
+ corruptFieldsPolicy: CorruptFieldsPolicy,
generateSegIdFieldsCnt: Int,
segmentIdProvidedPrefix: String,
metadataPolicy: MetadataPolicy)
@@ -68,7 +68,7 @@ class CobolSchema(copybook: Copybook,
inputFileNameField,
generateRecordId,
generateRecordBytes,
- generateCorruptFields,
+ corruptFieldsPolicy,
generateSegIdFieldsCnt,
segmentIdProvidedPrefix
) with Logging with Serializable {
@@ -82,6 +82,7 @@ class CobolSchema(copybook: Copybook,
@throws(classOf[IllegalStateException])
private def createSparkSchema(): StructType = {
+ val generateCorruptFields = corruptFieldsPolicy != CorruptFieldsPolicy.Disabled
val records = for (record <- copybook.getRootRecords) yield {
val group = record.asInstanceOf[Group]
val redefines = copybook.getAllSegmentRedefines
@@ -130,10 +131,15 @@ class CobolSchema(copybook: Copybook,
}
val recordsWithCorruptFields = if (generateCorruptFields) {
+ val rawFieldType = if (corruptFieldsPolicy == CorruptFieldsPolicy.Hex) {
+ StringType
+ } else {
+ BinaryType
+ }
recordsWithRecordId :+ StructField(Constants.corruptFieldsField, ArrayType(StructType(
Seq(
StructField(Constants.fieldNameColumn, StringType, nullable = false),
- StructField(Constants.rawValueColumn, BinaryType, nullable = false)
+ StructField(Constants.rawValueColumn, rawFieldType, nullable = false)
)
), containsNull = false), nullable = true)
} else {
@@ -323,7 +329,7 @@ object CobolSchema {
schema.inputFileNameField,
schema.generateRecordId,
schema.generateRecordBytes,
- schema.generateCorruptFields,
+ schema.corruptSchemaPolicy,
schema.generateSegIdFieldsCnt,
schema.segmentIdPrefix,
schema.metadataPolicy
@@ -343,6 +349,7 @@ object CobolSchema {
class CobolSchemaBuilder(copybook: Copybook) {
private var schemaRetentionPolicy: SchemaRetentionPolicy = SchemaRetentionPolicy.CollapseRoot
private var isDisplayAlwaysString: Boolean = false
+ private var decodeBinaryAsHex: Boolean = false
private var strictIntegralPrecision: Boolean = false
private var inputFileNameField: String = ""
private var generateRecordId: Boolean = false
@@ -362,6 +369,11 @@ object CobolSchema {
this
}
+ def withDecodeBinaryAsHex(decodeBinaryAsHex: Boolean): CobolSchemaBuilder = {
+ this.decodeBinaryAsHex = decodeBinaryAsHex
+ this
+ }
+
def withStrictIntegralPrecision(strictIntegralPrecision: Boolean): CobolSchemaBuilder = {
this.strictIntegralPrecision = strictIntegralPrecision
this
@@ -403,6 +415,11 @@ object CobolSchema {
}
def build(): CobolSchema = {
+ val corruptFieldsPolicy = if (generateCorruptFields) {
+ if (decodeBinaryAsHex) CorruptFieldsPolicy.Hex else CorruptFieldsPolicy.Binary
+ } else {
+ CorruptFieldsPolicy.Disabled
+ }
new CobolSchema(
copybook,
schemaRetentionPolicy,
@@ -411,7 +428,7 @@ object CobolSchema {
inputFileNameField,
generateRecordId,
generateRecordBytes,
- generateCorruptFields,
+ corruptFieldsPolicy,
generateSegIdFieldsCnt,
segmentIdProvidedPrefix,
metadataPolicy
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala
index 92a5744d..59fc9757 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala
@@ -869,7 +869,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
assert(sparkSchema.fields(1).dataType.isInstanceOf[StructType])
}
- "create schema with corrupt fields using builder" in {
+ "create schema with corrupt fields using builder with hex" in {
val copybook: String =
""" 01 RECORD.
| 05 STR1 PIC X(10).
@@ -877,6 +877,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
val parsedCopybook = CopybookParser.parse(copybook)
val cobolSchema = CobolSchema.builder(parsedCopybook)
.withGenerateCorruptFields(true)
+ .withDecodeBinaryAsHex(true)
.build()
val sparkSchema = cobolSchema.getSparkSchema
@@ -884,6 +885,26 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
assert(sparkSchema.fields.length == 2)
assert(sparkSchema.fields(1).name == "_corrupt_fields")
assert(sparkSchema.fields(1).dataType.isInstanceOf[ArrayType])
+ assert(sparkSchema.fields(1).dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType].fields(1).dataType == StringType)
+ }
+
+ "create schema with corrupt fields using builder with binary" in {
+ val copybook: String =
+ """ 01 RECORD.
+ | 05 STR1 PIC X(10).
+ |""".stripMargin
+ val parsedCopybook = CopybookParser.parse(copybook)
+ val cobolSchema = CobolSchema.builder(parsedCopybook)
+ .withGenerateCorruptFields(true)
+ .withDecodeBinaryAsHex(false)
+ .build()
+
+ val sparkSchema = cobolSchema.getSparkSchema
+
+ assert(sparkSchema.fields.length == 2)
+ assert(sparkSchema.fields(1).name == "_corrupt_fields")
+ assert(sparkSchema.fields(1).dataType.isInstanceOf[ArrayType])
+ assert(sparkSchema.fields(1).dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType].fields(1).dataType == BinaryType)
}
"create schema with various options" in {
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
index 790420ca..b5def863 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
@@ -20,6 +20,7 @@ import org.apache.spark.sql.types.StructType
import za.co.absa.cobrix.cobol.parser.Copybook
import za.co.absa.cobrix.cobol.parser.ast.Group
import za.co.absa.cobrix.cobol.parser.policies.MetadataPolicy
+import za.co.absa.cobrix.cobol.reader.parameters.CorruptFieldsPolicy.Disabled
import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy
import za.co.absa.cobrix.spark.cobol.schema.CobolSchema
@@ -31,7 +32,7 @@ class DummyCobolSchema(val sparkSchema: StructType) extends CobolSchema(
"",
false,
false,
- false,
+ Disabled,
0,
"",
MetadataPolicy.Basic) with Serializable {
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test41CorruptFieldsSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test41CorruptFieldsSpec.scala
index a82476ea..cbbc934b 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test41CorruptFieldsSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test41CorruptFieldsSpec.scala
@@ -18,7 +18,7 @@ package za.co.absa.cobrix.spark.cobol.source.integration
import org.apache.spark.sql.DataFrame
import org.scalatest.wordspec.AnyWordSpec
-import za.co.absa.cobrix.cobol.reader.parameters.CobolParametersParser.PARAM_CORRUPT_FIELDS
+import za.co.absa.cobrix.cobol.reader.parameters.CobolParametersParser.{PARAM_BINARY_AS_HEX, PARAM_CORRUPT_FIELDS}
import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
import za.co.absa.cobrix.spark.cobol.source.fixtures.{BinaryFileFixture, TextComparisonFixture}
import za.co.absa.cobrix.spark.cobol.utils.SparkUtils
@@ -42,7 +42,7 @@ class Test41CorruptFieldsSpec extends AnyWordSpec with SparkTestBase with Binary
).map(_.toByte)
"Corrupt fields record generation" should {
- "work when the option is turned on" in {
+ "work when the option is turned on as binary" in {
val expectedSchema =
"""root
| |-- ID: integer (nullable = true)
@@ -113,6 +113,77 @@ class Test41CorruptFieldsSpec extends AnyWordSpec with SparkTestBase with Binary
}
}
+ "work when the option is turned on as hex" in {
+ val expectedSchema =
+ """root
+ | |-- ID: integer (nullable = true)
+ | |-- F1: string (nullable = true)
+ | |-- F2: integer (nullable = true)
+ | |-- F3: integer (nullable = true)
+ | |-- F4: array (nullable = true)
+ | | |-- element: integer (containsNull = true)
+ | |-- _corrupt_fields: array (nullable = true)
+ | | |-- element: struct (containsNull = false)
+ | | | |-- field_name: string (nullable = false)
+ | | | |-- raw_value: string (nullable = false)
+ |""".stripMargin
+
+ val expectedData =
+ """[ {
+ | "ID" : 1,
+ | "F1" : "",
+ | "F2" : 5,
+ | "F3" : 6,
+ | "F4" : [ 1, 2, 3 ],
+ | "_corrupt_fields" : [ ]
+ |}, {
+ | "ID" : 2,
+ | "F1" : "1",
+ | "F3" : 5,
+ | "F4" : [ 4, 5, 6 ],
+ | "_corrupt_fields" : [ {
+ | "field_name" : "F2",
+ | "raw_value" : "D3"
+ | } ]
+ |}, {
+ | "ID" : 3,
+ | "F2" : 3,
+ | "F3" : 61702,
+ | "F4" : [ 7, 8, 9 ],
+ | "_corrupt_fields" : [ ]
+ |}, {
+ | "ID" : 4,
+ | "F3" : 0,
+ | "F4" : [ null, null, 0 ],
+ | "_corrupt_fields" : [ ]
+ |}, {
+ | "ID" : 5,
+ | "F1" : "A",
+ | "F2" : 4,
+ | "F3" : 160,
+ | "F4" : [ null, 5, null ],
+ | "_corrupt_fields" : [ {
+ | "field_name" : "F4[0]",
+ | "raw_value" : "C1"
+ | }, {
+ | "field_name" : "F4[2]",
+ | "raw_value" : "A3"
+ | } ]
+ |} ]
+ |""".stripMargin
+
+ withTempBinFile("corrupt_fields1", ".dat", data) { tmpFileName =>
+ val df = getDataFrame(tmpFileName, Map(PARAM_CORRUPT_FIELDS -> "true", PARAM_BINARY_AS_HEX -> "true"))
+
+ val actualSchema = df.schema.treeString
+ compareTextVertical(actualSchema, expectedSchema)
+
+ val actualData = SparkUtils.convertDataFrameToPrettyJSON(df.orderBy("ID"), 10)
+
+ compareTextVertical(actualData, expectedData)
+ }
+ }
+
"throw an exception when working with a hierarchical data" in {
val ex = intercept[IllegalArgumentException] {
getDataFrame("/tmp/dummy", Map(PARAM_CORRUPT_FIELDS -> "true", "segment-children:0" -> "COMPANY => DEPT,CUSTOMER"))
From 865359ec555e6b1806407f8d4feb409eea8d9404 Mon Sep 17 00:00:00 2001
From: Ruslan Iushchenko
Date: Tue, 24 Feb 2026 10:52:50 +0100
Subject: [PATCH 2/3] #822 Fix PR suggestions and make the Bin to HEX conversion
method more performant
---
.../extractors/record/RecordExtractors.scala | 17 ++++++++++++++++-
.../cobol/reader/schema/CobolSchema.scala | 2 +-
.../cobrix/spark/cobol/schema/CobolSchema.scala | 2 +-
.../cobrix/spark/cobol/RowExtractorSpec.scala | 7 +++++++
4 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
index c23ae957..ed782659 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
@@ -521,7 +521,8 @@ object RecordExtractors {
var i = 0
while (i < len) {
val r = if (generateCorruptFieldsAsHex) {
- handler.create(Array[Any](corruptFields(i).fieldName, corruptFields(i).rawValue.map("%02X" format _).mkString), corruptFieldsGroup)
+ val hex = convertArrayToHex(corruptFields(i).rawValue)
+ handler.create(Array[Any](corruptFields(i).fieldName, hex), corruptFieldsGroup)
} else {
handler.create(Array[Any](corruptFields(i).fieldName, corruptFields(i).rawValue), corruptFieldsGroup)
}
@@ -534,6 +535,20 @@ object RecordExtractors {
// toList() is a constant time operation, and List implements immutable Seq, which is exactly what is needed here.
outputRecords.toList
}
+
+ private[cobrix] def convertArrayToHex(a: Array[Byte]): String = {
+ if (a == null) return ""
+ val hexChars = new Array[Char](a.length * 2)
+ val hexArray = Array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F')
+ var i = 0
+ while (i < a.length) {
+ val v = a(i) & 0xFF
+ hexChars(i * 2) = hexArray(v >>> 4)
+ hexChars(i * 2 + 1) = hexArray(v & 0x0F)
+ i += 1
+ }
+ new String(hexChars)
+ }
/**
* Constructs a Group object representing corrupt fields. It is only needed for constructing records that require field names,
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
index 282e92e1..1fe4a8cc 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
@@ -39,7 +39,7 @@ import scala.collection.immutable.HashMap
* @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param generateRecordId If true, a record id field will be prepended to the beginning of the schema.
* @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema.
- * @param corruptSchemaPolicy Specifies a policy to handle corrupt records. By default, null values will be produced and the original value is ignored. If the policy is set the '_corrput_fileds' field will be generated.
+ * @param corruptSchemaPolicy Specifies a policy to handle corrupt fields. By default, null values will be produced and the original value is ignored. If the policy is set the '_corrput_fields' field will be generated.
* @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema.
* @param generateSegIdFieldsCnt A number of segment ID levels to generate
* @param segmentIdProvidedPrefix A prefix for each segment id levels to make segment ids globally unique (by default the current timestamp will be used)
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
index f4c26331..42dace51 100644
--- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
+++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
@@ -44,7 +44,7 @@ import scala.collection.mutable.ArrayBuffer
* @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param generateRecordId If true, a record id field will be prepended to the beginning of the schema.
* @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema.
- * @param corruptFieldsPolicy Specifies a policy to handle corrupt records. By default, null values will be produced and the original value is ignored. If the policy is set the '_corrput_fileds' field will be generated.
+ * @param corruptFieldsPolicy Specifies a policy to handle corrupt fields. By default, null values will be produced and the original value is ignored. If the policy is set the '_corrput_fields' field will be generated.
* @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema.
* @param generateSegIdFieldsCnt A number of segment ID levels to generate
* @param segmentIdProvidedPrefix A prefix for each segment id levels to make segment ids globally unique (by default the current timestamp will be used)
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/RowExtractorSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/RowExtractorSpec.scala
index ec530bad..0c72258e 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/RowExtractorSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/RowExtractorSpec.scala
@@ -141,4 +141,11 @@ class RowExtractorSpec extends AnyFunSuite {
//account type
assert(account(1).asInstanceOf[Int] === 0)
}
+
+ test("Byte array to HEX conversion") {
+ assert(RecordExtractors.convertArrayToHex(null) == "")
+ assert(RecordExtractors.convertArrayToHex(Array.empty) == "")
+ assert(RecordExtractors.convertArrayToHex(Array(1)) == "01")
+ assert(RecordExtractors.convertArrayToHex(bytes) == "0006C5E7C1D4D7D3C5F4404000000F40404040404040404040404040404040404040404040404040003FF0F0F0F0F0F0F0F0F0F0F0F0F0F0F2F0F0F0F4F0F0F0F1F200000FF0F0F0F0F0F0F0F0F0F0F0F0F0F0F3F0F0F0F4F0F0F1F0F200001FF0F0F0F0F0F0F0F0F5F0F0F6F0F0F1F2F0F0F3F0F1F0F0F000002F")
+ }
}
From 9af73e82b52c55a7ab37f3c139bef4f76693be24 Mon Sep 17 00:00:00 2001
From: Ruslan Iushchenko
Date: Tue, 24 Feb 2026 11:12:21 +0100
Subject: [PATCH 3/3] #822 Extract byte array to HEX conversion into a utility
object, ensuring a high-performance implementation across all its usages.
---
.../asttransform/DebugFieldsAdder.scala | 4 +-
.../parser/decoders/DecoderSelector.scala | 3 +-
.../parser/decoders/StringDecoders.scala | 21 ---------
...thRecordLengthExprRawRecordExtractor.scala | 9 ++--
.../extractors/record/RecordExtractors.scala | 17 +------
.../cobol/reader/schema/CobolSchema.scala | 2 +-
.../absa/cobrix/cobol/utils/StringUtils.scala | 44 +++++++++++++++++++
.../parser/decoders/StringDecodersSpec.scala | 8 ----
.../cobrix/cobol/utils/StringUtilsSuite.scala | 38 ++++++++++++++++
.../spark/cobol/schema/CobolSchema.scala | 2 +-
.../cobrix/spark/cobol/RowExtractorSpec.scala | 7 ---
11 files changed, 93 insertions(+), 62 deletions(-)
create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/utils/StringUtils.scala
create mode 100644 cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/utils/StringUtilsSuite.scala
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/DebugFieldsAdder.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/DebugFieldsAdder.scala
index 5bfd7477..60815e94 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/DebugFieldsAdder.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/DebugFieldsAdder.scala
@@ -20,10 +20,10 @@ import za.co.absa.cobrix.cobol.parser.CopybookParser.CopybookAST
import za.co.absa.cobrix.cobol.parser.ast.datatype.AlphaNumeric
import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive, Statement}
import za.co.absa.cobrix.cobol.parser.decoders.StringDecoders
-import za.co.absa.cobrix.cobol.parser.decoders.StringDecoders.KeepAll
import za.co.absa.cobrix.cobol.parser.encoding._
import za.co.absa.cobrix.cobol.parser.policies.DebugFieldsPolicy
import za.co.absa.cobrix.cobol.parser.policies.DebugFieldsPolicy.DebugFieldsPolicy
+import za.co.absa.cobrix.cobol.utils.StringUtils
import scala.collection.mutable.ArrayBuffer
@@ -47,7 +47,7 @@ class DebugFieldsAdder(debugFieldsPolicy: DebugFieldsPolicy) extends AstTransfor
}
val debugDecoder = debugFieldsPolicy match {
- case DebugFieldsPolicy.HexValue => StringDecoders.decodeHex _
+ case DebugFieldsPolicy.HexValue => StringUtils.convertArrayToHex _
case DebugFieldsPolicy.RawValue => StringDecoders.decodeRaw _
case DebugFieldsPolicy.StringValue => (a: Array[Byte]) => new String(a)
case _ => throw new IllegalStateException(s"Unexpected debug fields policy: $debugFieldsPolicy.")
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala
index b8a4c30e..5d7d2584 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala
@@ -23,6 +23,7 @@ import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPoint
import za.co.absa.cobrix.cobol.parser.encoding._
import za.co.absa.cobrix.cobol.parser.encoding.codepage.{CodePage, CodePageCommon}
import za.co.absa.cobrix.cobol.parser.position.Position
+import za.co.absa.cobrix.cobol.utils.StringUtils
import java.nio.charset.{Charset, StandardCharsets}
import scala.util.control.NonFatal
@@ -94,7 +95,7 @@ object DecoderSelector {
case UTF16 =>
StringDecoders.decodeUtf16String(_, getStringStrimmingType(stringTrimmingPolicy), isUtf16BigEndian, improvedNullDetection)
case HEX =>
- StringDecoders.decodeHex
+ StringUtils.convertArrayToHex
case RAW =>
StringDecoders.decodeRaw
}
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecoders.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecoders.scala
index 8ca8e651..dcc40710 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecoders.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecoders.scala
@@ -33,9 +33,6 @@ object StringDecoders {
val TrimBoth = 4
val KeepAll = 5
- // Characters used for HEX conversion
- private val HEX_ARRAY = "0123456789ABCDEF".toCharArray
-
/**
* A decoder for any EBCDIC string fields (alphabetical or any char)
*
@@ -125,24 +122,6 @@ object StringDecoders {
}
}
- /**
- * A decoder for representing bytes as hex strings
- *
- * @param bytes A byte array that represents the binary data
- * @return A HEX string representation of the binary data
- */
- final def decodeHex(bytes: Array[Byte]): String = {
- val hexChars = new Array[Char](bytes.length * 2)
- var i = 0
- while (i < bytes.length) {
- val v = bytes(i) & 0xFF
- hexChars(i * 2) = HEX_ARRAY(v >>> 4)
- hexChars(i * 2 + 1) = HEX_ARRAY(v & 0x0F)
- i += 1
- }
- new String(hexChars)
- }
-
/**
* A decoder that doesn't decode, but just passes the bytes the way they are.
*
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/raw/FixedWithRecordLengthExprRawRecordExtractor.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/raw/FixedWithRecordLengthExprRawRecordExtractor.scala
index fc2a4026..df54983c 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/raw/FixedWithRecordLengthExprRawRecordExtractor.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/raw/FixedWithRecordLengthExprRawRecordExtractor.scala
@@ -21,6 +21,7 @@ import za.co.absa.cobrix.cobol.parser.ast.Primitive
import za.co.absa.cobrix.cobol.reader.iterator.RecordLengthExpression
import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
import za.co.absa.cobrix.cobol.reader.validator.ReaderParametersValidator
+import za.co.absa.cobrix.cobol.utils.StringUtils
import scala.util.Try
@@ -127,7 +128,7 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
case l: Long => l.toInt
case s: String => Try{ s.toInt }.getOrElse(throw new IllegalStateException(s"Record length value of the field ${lengthAST.name} must be an integral type, encountered: '$s'."))
case d: BigDecimal => d.toInt
- case null => throw new IllegalStateException(s"Null encountered as a record length field (offset: $byteIndex, raw value: ${getBytesAsHexString(binaryDataStart)}).")
+ case null => throw new IllegalStateException(s"Null encountered as a record length field (offset: $byteIndex, raw value: ${StringUtils.convertArrayToHex(binaryDataStart)}).")
case _ => throw new IllegalStateException(s"Record length value of the field ${lengthAST.name} must be an integral type.")
}
} else {
@@ -136,7 +137,7 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
case l: Long => getRecordLengthFromMapping(l.toString)
case d: BigDecimal => getRecordLengthFromMapping(d.toString())
case s: String => getRecordLengthFromMapping(s)
- case null => defaultRecordLength.getOrElse(throw new IllegalStateException(s"Null encountered as a record length field (offset: $byteIndex, raw value: ${getBytesAsHexString(binaryDataStart)})."))
+ case null => defaultRecordLength.getOrElse(throw new IllegalStateException(s"Null encountered as a record length field (offset: $byteIndex, raw value: ${StringUtils.convertArrayToHex(binaryDataStart)})."))
case _ => throw new IllegalStateException(s"Record length value of the field ${lengthAST.name} must be an integral type.")
}
}
@@ -150,10 +151,6 @@ class FixedWithRecordLengthExprRawRecordExtractor(ctx: RawRecordContext,
}
}
- final private def getBytesAsHexString(bytes: Array[Byte]): String = {
- bytes.map("%02X" format _).mkString
- }
-
private def fetchRecordUsingRecordLengthFieldExpression(expr: RecordLengthExpression): Option[Array[Byte]] = {
val lengthFieldBlock = expr.requiredBytesToread
val evaluator = expr.evaluator
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
index ed782659..10b5f20a 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/record/RecordExtractors.scala
@@ -23,6 +23,7 @@ import za.co.absa.cobrix.cobol.parser.common.Constants
import za.co.absa.cobrix.cobol.parser.encoding.RAW
import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy
import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaRetentionPolicy
+import za.co.absa.cobrix.cobol.utils.StringUtils
import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
@@ -521,7 +522,7 @@ object RecordExtractors {
var i = 0
while (i < len) {
val r = if (generateCorruptFieldsAsHex) {
- val hex = convertArrayToHex(corruptFields(i).rawValue)
+ val hex = StringUtils.convertArrayToHex(corruptFields(i).rawValue)
handler.create(Array[Any](corruptFields(i).fieldName, hex), corruptFieldsGroup)
} else {
handler.create(Array[Any](corruptFields(i).fieldName, corruptFields(i).rawValue), corruptFieldsGroup)
@@ -536,20 +537,6 @@ object RecordExtractors {
outputRecords.toList
}
- private[cobrix] def convertArrayToHex(a: Array[Byte]): String = {
- if (a == null) return ""
- val hexChars = new Array[Char](a.length * 2)
- val hexArray = Array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F')
- var i = 0
- while (i < a.length) {
- val v = a(i) & 0xFF
- hexChars(i * 2) = hexArray(v >>> 4)
- hexChars(i * 2 + 1) = hexArray(v & 0x0F)
- i += 1
- }
- new String(hexChars)
- }
-
/**
* Constructs a Group object representing corrupt fields. It is only needed for constructing records that require field names,
* such as JSON. Field sizes and encoding do not really matter
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
index 1fe4a8cc..b6e9e00e 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
@@ -39,7 +39,7 @@ import scala.collection.immutable.HashMap
* @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param generateRecordId If true, a record id field will be prepended to the beginning of the schema.
* @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema.
- * @param corruptSchemaPolicy Specifies a policy to handle corrupt fields. By default, null values will be produced and the original value is ignored. If the policy is set the '_corrput_fields' field will be generated.
+ * @param corruptSchemaPolicy Specifies a policy to handle corrupt fields. By default, null values will be produced and the original value is ignored. If the policy is set the '_corrupt_fields' field will be generated.
* @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema.
* @param generateSegIdFieldsCnt A number of segment ID levels to generate
* @param segmentIdProvidedPrefix A prefix for each segment id levels to make segment ids globally unique (by default the current timestamp will be used)
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/utils/StringUtils.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/utils/StringUtils.scala
new file mode 100644
index 00000000..154fa187
--- /dev/null
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/utils/StringUtils.scala
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.cobol.utils
+
+object StringUtils {
+ // Characters used for HEX conversion
+ private final val HEX_ARRAY = Array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F')
+
+ /**
+ * Converts an array of bytes into a hexadecimal string representation.
+ *
+ * The main goal of this method is high CPU and memory efficiency.
+ *
+ * @param a the input array of bytes to be converted. If the input is null, the method returns null.
+ * @return a string representing the hexadecimal equivalent of the input byte array, or null if the input is null.
+ */
+ def convertArrayToHex(a: Array[Byte]): String = {
+ if (a == null) return null
+ val hexArray = HEX_ARRAY
+ val hexChars = new Array[Char](a.length * 2)
+ var i = 0
+ while (i < a.length) {
+ val v = a(i) & 0xFF
+ hexChars(i * 2) = hexArray(v >>> 4)
+ hexChars(i * 2 + 1) = hexArray(v & 0x0F)
+ i += 1
+ }
+ new String(hexChars)
+ }
+}
diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala
index cc525e72..fc94eb23 100644
--- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala
+++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala
@@ -608,14 +608,6 @@ class StringDecodersSpec extends AnyWordSpec {
}
}
- "decodeHex()" should {
- "decode bytes as HEX strings" in {
- val hex = decodeHex(Array[Byte](0, 3, 16, 127, -1, -127))
-
- assert(hex == "0003107FFF81")
- }
- }
-
"decodeRaw()" should {
"should work on empty arrays" in {
val data = Array.empty[Byte]
diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/utils/StringUtilsSuite.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/utils/StringUtilsSuite.scala
new file mode 100644
index 00000000..30a20d43
--- /dev/null
+++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/utils/StringUtilsSuite.scala
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.cobol.utils
+
+import org.scalatest.wordspec.AnyWordSpec
+
+class StringUtilsSuite extends AnyWordSpec {
+ "StringUtils" should {
+ "convert byte array to hex string correctly" in {
+ val byteArray = Array[Byte](0x00, 0x01, 0x02, 0x0A, 0x0F, 0x10, 0x1F, 0x7F, 0x80.toByte, 0xFF.toByte)
+ val expectedHexString = "0001020A0F101F7F80FF"
+ assert(StringUtils.convertArrayToHex(byteArray) == expectedHexString)
+ }
+
+ "return empty string when input is an empty array" in {
+ assert(StringUtils.convertArrayToHex(Array.empty) == "")
+ }
+
+ "return null when input is null" in {
+ assert(StringUtils.convertArrayToHex(null) == null)
+ }
+ }
+
+}
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
index 42dace51..238cc88f 100644
--- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
+++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
@@ -44,7 +44,7 @@ import scala.collection.mutable.ArrayBuffer
* @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param generateRecordId If true, a record id field will be prepended to the beginning of the schema.
* @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema.
- * @param corruptFieldsPolicy Specifies a policy to handle corrupt fields. By default, null values will be produced and the original value is ignored. If the policy is set the '_corrput_fields' field will be generated.
+ * @param corruptFieldsPolicy Specifies a policy to handle corrupt fields. By default, null values will be produced and the original value is ignored. If the policy is set the '_corrupt_fields' field will be generated.
* @param inputFileNameField If non-empty, a source file name will be prepended to the beginning of the schema.
* @param generateSegIdFieldsCnt A number of segment ID levels to generate
* @param segmentIdProvidedPrefix A prefix for each segment id levels to make segment ids globally unique (by default the current timestamp will be used)
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/RowExtractorSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/RowExtractorSpec.scala
index 0c72258e..ec530bad 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/RowExtractorSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/RowExtractorSpec.scala
@@ -141,11 +141,4 @@ class RowExtractorSpec extends AnyFunSuite {
//account type
assert(account(1).asInstanceOf[Int] === 0)
}
-
- test("Byte array to HEX conversion") {
- assert(RecordExtractors.convertArrayToHex(null) == "")
- assert(RecordExtractors.convertArrayToHex(Array.empty) == "")
- assert(RecordExtractors.convertArrayToHex(Array(1)) == "01")
- assert(RecordExtractors.convertArrayToHex(bytes) == "0006C5E7C1D4D7D3C5F4404000000F40404040404040404040404040404040404040404040404040003FF0F0F0F0F0F0F0F0F0F0F0F0F0F0F2F0F0F0F4F0F0F0F1F200000FF0F0F0F0F0F0F0F0F0F0F0F0F0F0F3F0F0F0F4F0F0F1F0F200001FF0F0F0F0F0F0F0F0F5F0F0F6F0F0F1F2F0F0F3F0F1F0F0F000002F")
- }
}