From e2d9eff80337714fc4fe6f0723393035b8bc28d4 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 12:19:21 -0400 Subject: [PATCH 01/24] Implement left join of two datasets with working test case. (cherry picked from commit 0cb0a58d1e777a4599012035bbd46f183356fd39) --- .../extensions/common/DatasetExtensions.kt | 59 +++++++++++++++++++ .../common/DatasetExtensionsTests.kt | 28 +++++++++ 2 files changed, 87 insertions(+) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index 5b31d60..a9c5130 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -73,6 +73,65 @@ object DatasetExtensions { return builder.build() } + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], + types = [Dataset::class, Dataset::class, PyBaseString::class, PyBaseString::class], + ) + fun leftJoin(args: Array, keywords: Array): Dataset? 
{ + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), + arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), + "leftJoin", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() + val column = parsedArgs.requirePyObject("columnIndex").toJava() + val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + // Left Joins dataset to dataset2 on column = column2 + val columnName = List(dataset.columnCount) { col -> + dataset.columnNames[col] + } + val columnName2 = List(dataset2.columnCount) { col -> + dataset2.columnNames[col] + } + val combinedColumnName = columnName + columnName2 + val columnType = List>(dataset.columnCount) { col -> + dataset.columnTypes[col] + } + val columnType2 = List>(dataset2.columnCount) { col -> + dataset2.columnTypes[col] + } + val combinedColumnType = columnType + columnType2 + val builder = DatasetBuilder.newBuilder().colNames(combinedColumnName).colTypes(combinedColumnType) + + for (row in dataset.rowIndices) { + var found = false + val listToAppend = ArrayList() + for (row2 in dataset2.rowIndices) { + if (dataset[row, column] == dataset2[row2, column2]) { + found = true + break + } + } + for (col in 0 until dataset.columnCount) { + listToAppend.add(dataset[row, col]) + } + for (col in 0 until dataset2.columnCount) { + if (found) { + listToAppend.add(dataset2[row, col]) + } else { + listToAppend.add(null) + } + } + builder.addRow(*listToAppend.toTypedArray()) + } + return builder.build() + } + @Suppress("unused") @ScriptFunction(docBundlePrefix = "DatasetExtensions") @KeywordArgs( diff --git a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt index b7b2dce..8b62b03 100644 --- 
a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt +++ b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt @@ -22,6 +22,12 @@ class DatasetExtensionsTests : JythonTest( .addRow(1, 3.14, "pi") .addRow(2, 6.28, "tau") .build() + globals["dataset2"] = DatasetBuilder.newBuilder() + .colNames("a", "b2", "c2") + .colTypes(Int::class.javaObjectType, Double::class.javaObjectType, String::class.java) + .addRow(1, 3.1415, "pi2") + .addRow(2, 56, "tau2") + .build() val excelSample = DatasetExtensionsTests::class.java.getResourceAsStream("sample.xlsx")!!.readAllBytes() @@ -103,6 +109,28 @@ class DatasetExtensionsTests : JythonTest( } } + context("Left Join test") { + test("Left Join") { + eval("utils.leftJoin(dataset, dataset2, 0, 0)").asClue { + it.columnNames shouldBe listOf("a", "b", "c", "a", "b2", "c2") + it.columnTypes shouldBe listOf( + Int::class.javaObjectType, + Double::class.javaObjectType, + String::class.java, + Int::class.javaObjectType, + Double::class.javaObjectType, + String::class.java, + ) + it.rowCount shouldBe 2 + it.getColumnAsList(0) shouldBe listOf(1, 2) + it.getColumnAsList(1) shouldBe listOf(3.14, 6.28) + it.getColumnAsList(2) shouldBe listOf("pi", "tau") + it.getColumnAsList(3) shouldBe listOf(1, 2) + it.getColumnAsList(4) shouldBe listOf(3.1415, 56.0) + it.getColumnAsList(5) shouldBe listOf("pi2", "tau2") + } + } + } context("Filter tests") { test("Constant filter") { eval("utils.filter(dataset, lambda **kwargs: False)").asClue { From e5d2488190eed7770bc70769698c58dc64ea7ea1 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 12:25:16 -0400 Subject: [PATCH 02/24] Add description for leftJoin (cherry picked from commit 399777f1b6277f356d3bde53af3d56aac9f506e1) --- .../imdc/extensions/common/DatasetExtensions.properties | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties 
b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties index b7a5530..e4306a7 100644 --- a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties +++ b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties @@ -4,6 +4,13 @@ map.param.mapper=A callable reference to invoke for each row. Will receive each map.param.preserveColumnTypes=True if the types of the output dataset should match the input. Otherwise, the output dataset will lose type information. map.returns=A modified dataset. +leftJoin.desc=Performs a left join on two datasets, returning a new dataset. +leftJoin.param.dataset=The first dataset. Must not be null. +leftJoin.param.dataset2=The second dataset. Must not be null. +leftJoin.param.columnIndex=Column index in the first dataset to join on. +leftJoin.param.columnIndex2=Column index in the second dataset to join on. +leftJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + filter.desc=Runs a filtering function on each row in a dataset, returning a truncated dataset. filter.param.dataset=The dataset to filter. Must not be null. filter.param.filter=A function to run on each row. Will be called with keyword arguments matching column names. The first argument will be named 'row' and is the row index. Return True to keep the row in the output dataset. 
From a2a4991c939f777fbb184c0a6f5fa7c61469939e Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 12:38:26 -0400 Subject: [PATCH 03/24] Change type of columnIndex to Int (cherry picked from commit b291f05b28e2f6f6c927755c9365ab643f833a33) --- .../main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index a9c5130..ea2aecd 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -77,7 +77,7 @@ object DatasetExtensions { @ScriptFunction(docBundlePrefix = "DatasetExtensions") @KeywordArgs( names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], - types = [Dataset::class, Dataset::class, PyBaseString::class, PyBaseString::class], + types = [Dataset::class, Dataset::class, Int::class, Int::class], ) fun leftJoin(args: Array, keywords: Array): Dataset? 
{ val parsedArgs = PyArgParser.parseArgs( From 881d5843a7f5fbbc76af22fbac4a3416cef4dcb2 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 14:16:59 -0400 Subject: [PATCH 04/24] Set temp value (cherry picked from commit 83dc40528c50cc143617d054dc3afafad6c39fc4) --- .../org/imdc/extensions/common/DatasetExtensions.kt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index ea2aecd..b09fe21 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -111,9 +111,11 @@ object DatasetExtensions { for (row in dataset.rowIndices) { var found = false val listToAppend = ArrayList() - for (row2 in dataset2.rowIndices) { - if (dataset[row, column] == dataset2[row2, column2]) { + var row2: Int? = null // Declare row2 outside the loop so it can be used later + for (rowIndex in dataset2.rowIndices) { + if (dataset[row, column] == dataset2[rowIndex, column2]) { found = true + row2 = rowIndex break } } @@ -122,7 +124,7 @@ object DatasetExtensions { } for (col in 0 until dataset2.columnCount) { if (found) { - listToAppend.add(dataset2[row, col]) + listToAppend.add(dataset2[row2!!, col]) } else { listToAppend.add(null) } From 7c8d0815a9c1d119b001190d44614ffc1392d8a8 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 14:20:51 -0400 Subject: [PATCH 05/24] Clean up (cherry picked from commit 17ef1c5bd12ce62dd6f17a727c811777262c2caf) --- .../extensions/common/DatasetExtensions.kt | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index b09fe21..dd7c2ac 100644 --- 
a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -91,46 +91,43 @@ object DatasetExtensions { val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() val column = parsedArgs.requirePyObject("columnIndex").toJava() val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() - // Left Joins dataset to dataset2 on column = column2 - val columnName = List(dataset.columnCount) { col -> - dataset.columnNames[col] - } - val columnName2 = List(dataset2.columnCount) { col -> - dataset2.columnNames[col] - } + + val columnName = dataset.columnNames.toList() + val columnName2 = dataset2.columnNames.toList() val combinedColumnName = columnName + columnName2 - val columnType = List>(dataset.columnCount) { col -> - dataset.columnTypes[col] - } - val columnType2 = List>(dataset2.columnCount) { col -> - dataset2.columnTypes[col] - } + + val columnType = dataset.columnTypes.toList() + val columnType2 = dataset2.columnTypes.toList() val combinedColumnType = columnType + columnType2 - val builder = DatasetBuilder.newBuilder().colNames(combinedColumnName).colTypes(combinedColumnType) + + val builder = DatasetBuilder.newBuilder() + .colNames(combinedColumnName) + .colTypes(combinedColumnType) for (row in dataset.rowIndices) { - var found = false - val listToAppend = ArrayList() - var row2: Int? = null // Declare row2 outside the loop so it can be used later - for (rowIndex in dataset2.rowIndices) { + val listToAppend = Array(combinedColumnName.size) { null } + var row2: Int? 
= null + + dataset2.rowIndices.forEachIndexed { rowIndex, _ -> if (dataset[row, column] == dataset2[rowIndex, column2]) { - found = true row2 = rowIndex - break + return@forEachIndexed } } - for (col in 0 until dataset.columnCount) { - listToAppend.add(dataset[row, col]) + + dataset.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[colIndex] = dataset[row, colIndex] } - for (col in 0 until dataset2.columnCount) { - if (found) { - listToAppend.add(dataset2[row2!!, col]) - } else { - listToAppend.add(null) + + if (row2 != null) { + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[row2!!, colIndex] } } - builder.addRow(*listToAppend.toTypedArray()) + + builder.addRow(*listToAppend.copyOf(combinedColumnName.size)) } + return builder.build() } From 9577aa9e369c4a79f994d067b408e45e4c61d184 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 14:33:21 -0400 Subject: [PATCH 06/24] Clean up (cherry picked from commit 89a7acfe14d9474569db78b91393ae72a88e5a06) --- .../org/imdc/extensions/common/DatasetExtensions.kt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index dd7c2ac..fcb54af 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -105,7 +105,7 @@ object DatasetExtensions { .colTypes(combinedColumnType) for (row in dataset.rowIndices) { - val listToAppend = Array(combinedColumnName.size) { null } + val listToAppend = arrayOfNulls(combinedColumnName.size) var row2: Int? 
= null dataset2.rowIndices.forEachIndexed { rowIndex, _ -> @@ -119,13 +119,13 @@ object DatasetExtensions { listToAppend[colIndex] = dataset[row, colIndex] } - if (row2 != null) { + row2?.let { r2 -> dataset2.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[dataset.columnCount + colIndex] = dataset2[row2!!, colIndex] + listToAppend[dataset.columnCount + colIndex] = dataset2[r2, colIndex] } } - builder.addRow(*listToAppend.copyOf(combinedColumnName.size)) + builder.addRow(*listToAppend) } return builder.build() From 51f3c0f73a7b4bba3e1af119085b6e04742f64e6 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Wed, 5 Jul 2023 12:39:09 -0400 Subject: [PATCH 07/24] Add splitter of dataset (cherry picked from commit 78538522f18c030595c693e9a4a4d0af6682a597) --- .../extensions/common/DatasetExtensions.kt | 236 ++++++++++++++++++ .../common/DatasetExtensions.properties | 26 ++ .../common/DatasetExtensionsTests.kt | 111 ++++++++ 3 files changed, 373 insertions(+) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index fcb54af..4c7425a 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -131,6 +131,242 @@ object DatasetExtensions { return builder.build() } + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], + types = [Dataset::class, Dataset::class, Int::class, Int::class], + ) + fun innerJoin(args: Array, keywords: Array): Dataset? 
{ + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), + arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), + "innerJoin", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() + val column = parsedArgs.requirePyObject("columnIndex").toJava() + val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + + val columnName = dataset.columnNames.toList() + val columnName2 = dataset2.columnNames.toList() + val combinedColumnName = columnName + columnName2 + + val columnType = dataset.columnTypes.toList() + val columnType2 = dataset2.columnTypes.toList() + val combinedColumnType = columnType + columnType2 + + val builder = DatasetBuilder.newBuilder() + .colNames(combinedColumnName) + .colTypes(combinedColumnType) + + for (row in dataset.rowIndices) { + val listToAppend = arrayOfNulls(combinedColumnName.size) + var row2: Int? = null + + dataset2.rowIndices.forEachIndexed { rowIndex, _ -> + if (dataset[row, column] == dataset2[rowIndex, column2]) { + row2 = rowIndex + return@forEachIndexed + } + } + + if (row2 != null) { + dataset.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[colIndex] = dataset[row, colIndex] + } + + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[row2!!, colIndex] + } + + builder.addRow(*listToAppend) + } + } + + return builder.build() + } + + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], + types = [Dataset::class, Dataset::class, Int::class, Int::class], + ) + fun rightJoin(args: Array, keywords: Array): Dataset? 
{ + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), + arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), + "rightJoin", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() + val column = parsedArgs.requirePyObject("columnIndex").toJava() + val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + + val columnName = dataset.columnNames.toList() + val columnName2 = dataset2.columnNames.toList() + val combinedColumnName = columnName + columnName2 + + val columnType = dataset.columnTypes.toList() + val columnType2 = dataset2.columnTypes.toList() + val combinedColumnType = columnType + columnType2 + + val builder = DatasetBuilder.newBuilder() + .colNames(combinedColumnName) + .colTypes(combinedColumnType) + + for (row in dataset2.rowIndices) { + val listToAppend = arrayOfNulls(combinedColumnName.size) + var row2: Int? = null + + dataset.rowIndices.forEachIndexed { rowIndex, _ -> + if (dataset[rowIndex, column] == dataset2[row, column2]) { + row2 = rowIndex + return@forEachIndexed + } + } + + if (row2 != null) { + dataset.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[colIndex] = dataset[row2!!, colIndex] + } + + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[row, colIndex] + } + + builder.addRow(*listToAppend) + } + } + + return builder.build() + } + + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], + types = [Dataset::class, Dataset::class, Int::class, Int::class], + ) + fun outerJoin(args: Array, keywords: Array): Dataset? 
{ + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), + arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), + "outerJoin", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() + val column = parsedArgs.requirePyObject("columnIndex").toJava() + val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + + val columnName = dataset.columnNames.toList() + val columnName2 = dataset2.columnNames.toList() + val combinedColumnName = columnName + columnName2 + + val columnType = dataset.columnTypes.toList() + val columnType2 = dataset2.columnTypes.toList() + val combinedColumnType = columnType + columnType2 + + val builder = DatasetBuilder.newBuilder() + .colNames(combinedColumnName) + .colTypes(combinedColumnType) + + for (row in dataset.rowIndices) { + val listToAppend = arrayOfNulls(combinedColumnName.size) + var row2: Int? = null + + dataset2.rowIndices.forEachIndexed { rowIndex, _ -> + if (dataset[row, column] == dataset2[rowIndex, column2]) { + row2 = rowIndex + return@forEachIndexed + } + } + + dataset.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[colIndex] = dataset[row, colIndex] + } + + row2?.let { r2 -> + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[r2, colIndex] + } + } + + builder.addRow(*listToAppend) + } + + // Add unmatched rows from dataset2 + for (row2 in dataset2.rowIndices) { + val listToAppend = arrayOfNulls(combinedColumnName.size) + var row1: Int? 
= null + + dataset.rowIndices.forEachIndexed { rowIndex, _ -> + if (dataset[rowIndex, column] == dataset2[row2, column2]) { + row1 = rowIndex + return@forEachIndexed + } + } + + if (row1 == null) { + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[row2, colIndex] + } + builder.addRow(*listToAppend) + } + } + + return builder.build() + } + + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "columnsToSplit"], + types = [Dataset::class, Array>::class], + ) + fun splitter(args: Array, keywords: Array): Array { + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "columnsToSplit"), + arrayOf(Dataset::class.java, PyObject::class.java), + "splitter", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val columnsToSplit = parsedArgs.requirePyObject("columnsToSplit").toJava>>() + val datasetSplit = Array(columnsToSplit.size) { null } + + for ((currentDataset, newDataSets) in columnsToSplit.withIndex()) { + val columnNames = mutableListOf() + val columnTypes = mutableListOf>() + + newDataSets.forEachIndexed { _, column -> + columnNames.add(dataset.columnNames[column]) + columnTypes.add(dataset.columnTypes[column]) + } + + val builder = DatasetBuilder.newBuilder() + .colNames(columnNames) + .colTypes(columnTypes) + + for (row in dataset.rowIndices) { + val listToAppend = newDataSets.map { dataset[row, it] }.toTypedArray() + builder.addRow(*listToAppend) + } + + datasetSplit[currentDataset] = builder.build() + } + + return datasetSplit + } + @Suppress("unused") @ScriptFunction(docBundlePrefix = "DatasetExtensions") @KeywordArgs( diff --git a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties index e4306a7..f6eab57 100644 --- 
a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties +++ b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties @@ -11,6 +11,27 @@ leftJoin.param.columnIndex=Column index in the first dataset to join on. leftJoin.param.columnIndex2=Column index in the second dataset to join on. leftJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. +rightJoin.desc=Performs a right join on two datasets, returning a new dataset. +rightJoin.param.dataset=The first dataset. Must not be null. +rightJoin.param.dataset2=The second dataset. Must not be null. +rightJoin.param.columnIndex=Column index in the first dataset to join on. +rightJoin.param.columnIndex2=Column index in the second dataset to join on. +rightJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + +innerJoin.desc=Performs an inner join on two datasets, returning a new dataset. +innerJoin.param.dataset=The first dataset. Must not be null. +innerJoin.param.dataset2=The second dataset. Must not be null. +innerJoin.param.columnIndex=Column index in the first dataset to join on. +innerJoin.param.columnIndex2=Column index in the second dataset to join on. +innerJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + +outerJoin.desc=Performs a full outer join on two datasets, returning a new dataset. +outerJoin.param.dataset=The first dataset. Must not be null. +outerJoin.param.dataset2=The second dataset. Must not be null. +outerJoin.param.columnIndex=Column index in the first dataset to join on. +outerJoin.param.columnIndex2=Column index in the second dataset to join on. +outerJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + filter.desc=Runs a filtering function on each row in a dataset, returning a truncated dataset. filter.param.dataset=The dataset to filter. Must not be null. filter.param.filter=A function to run on each row. 
Will be called with keyword arguments matching column names. The first argument will be named 'row' and is the row index. Return True to keep the row in the output dataset. @@ -43,6 +64,11 @@ equals.param.dataset1=The first dataset. Must not be null. equals.param.dataset2=The second dataset. Must not be null. equals.returns=True if the two datasets have the same number of columns, with the same types, in the same order, with the same data in each row. +splitter.desc=Splits a dataset into any number of datasets, based on a list of columns. +splitter.param.dataset=Dataset to split. Must not be null. +splitter.param.columnsToSplit=List of columns that you would like to split dataset into +splitter.returns=List of datasets + valuesEqual.desc=Compares two datasets' content. valuesEqual.param.dataset1=The first dataset. Must not be null. valuesEqual.param.dataset2=The second dataset. Must not be null. diff --git a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt index 8b62b03..718aa03 100644 --- a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt +++ b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt @@ -3,6 +3,7 @@ package org.imdc.extensions.common import com.inductiveautomation.ignition.common.BasicDataset import com.inductiveautomation.ignition.common.Dataset import com.inductiveautomation.ignition.common.util.DatasetBuilder +import io.kotest.assertions.asClue import io.kotest.assertions.withClue import io.kotest.engine.spec.tempfile import io.kotest.matchers.shouldBe @@ -16,6 +17,32 @@ class DatasetExtensionsTests : JythonTest( { globals -> globals["utils"] = DatasetExtensions globals["builder"] = DatasetBuilder.newBuilder() + globals["outerJoin1"] = DatasetBuilder.newBuilder() + .colNames("EmpID", "EmpName", "City", "Designation") + .colTypes(Int::class.javaObjectType, String::class.java, String::class.java, 
String::class.java) + .addRow(1, "Charlotte Robinson", "Chicago", "Consultant") + .addRow(2, "Madison Phillips", "Dallas", "Senior Analyst") + .addRow(3, "Emma Hernandez", "Phoenix", "Senior Analyst") + .addRow(4, "Samantha Sanchez", "San Diego", "Principal Conultant") + .addRow(5, "Sadie Ward", "San Antonio", "Consultant") + .addRow(6, "Savannah Perez", "New York", "Principal Conultant") + .addRow(7, "Victoria Gray", "Los Angeles", "Assistant") + .addRow(8, "Alyssa Lewis", "Houston", "Consultant") + .addRow(9, "Anna Lee", "San Jose", "Principal Conultant") + .addRow(10, "Riley Hall", "Philadelphia", "Senior Analyst") + .build() + globals["outerJoin2"] = DatasetBuilder.newBuilder() + .colNames("EmpID", "Department_ID", "DepartmentName") + .colTypes(Int::class.javaObjectType, Int::class.javaObjectType, String::class.java) + .addRow(1, 0, "Executive") + .addRow(2, 1, "Document Control") + .addRow(3, 2, "Finance") + .addRow(4, 3, "Engineering") + .addRow(5, 4, "Facilities and Maintenance") + .addRow(6, 2, "Finance") + .addRow(10, 4, "Facilities and Maintenance") + .build() + globals["dataset"] = DatasetBuilder.newBuilder() .colNames("a", "b", "c") .colTypes(Int::class.javaObjectType, Double::class.javaObjectType, String::class.java) @@ -29,6 +56,12 @@ class DatasetExtensionsTests : JythonTest( .addRow(2, 56, "tau2") .build() + val tempArray: Array> = arrayOf( + arrayOf(0, 1, 2), + arrayOf(3), + ) + globals["splitAt"] = tempArray + val excelSample = DatasetExtensionsTests::class.java.getResourceAsStream("sample.xlsx")!!.readAllBytes() val tempXlsx = tempfile(suffix = "xlsx").also { @@ -131,6 +164,84 @@ class DatasetExtensionsTests : JythonTest( } } } + + // https://www.sqlshack.com/sql-outer-join-overview-and-examples/ + context("Outer Join test") { + test("Outer Join") { + eval("utils.outerJoin(outerJoin1, outerJoin2, 0, 0)").asClue { + it.columnNames shouldBe listOf("EmpID", "EmpName", "City", "Designation", "EmpID", "Department_ID", "DepartmentName") + 
it.columnTypes shouldBe listOf( + Int::class.javaObjectType, + String::class.java, + String::class.java, + String::class.java, + Int::class.javaObjectType, + Int::class.javaObjectType, + String::class.java, + ) + it.getColumnAsList(6) shouldBe listOf("Executive", "Document Control", "Finance", "Engineering", "Facilities and Maintenance", "Finance", null, null, null, "Facilities and Maintenance") + it.rowCount shouldBe 10 + } + } + } + + context("Right Join test") { + test("Right Join") { + eval("utils.rightJoin(outerJoin1, outerJoin2, 0, 0)").asClue { + it.columnNames shouldBe listOf("EmpID", "EmpName", "City", "Designation", "EmpID", "Department_ID", "DepartmentName") + it.columnTypes shouldBe listOf( + Int::class.javaObjectType, + String::class.java, + String::class.java, + String::class.java, + Int::class.javaObjectType, + Int::class.javaObjectType, + String::class.java, + ) + it.getColumnAsList(5) shouldBe listOf(0, 1, 2, 3, 4, 2, 4) + it.rowCount shouldBe 7 + } + } + } + + context("Inner Join test") { + test("Inner Join") { + eval("utils.innerJoin(outerJoin1, outerJoin2, 0, 0)").asClue { + it.columnNames shouldBe listOf("EmpID", "EmpName", "City", "Designation", "EmpID", "Department_ID", "DepartmentName") + it.columnTypes shouldBe listOf( + Int::class.javaObjectType, + String::class.java, + String::class.java, + String::class.java, + Int::class.javaObjectType, + Int::class.javaObjectType, + String::class.java, + ) + it.getColumnAsList(5) shouldBe listOf(0, 1, 2, 3, 4, 2, 4) + it.rowCount shouldBe 7 + } + } + } + + context("Dataset Splitter") { + test("Split Dataset") { + eval>("utils.splitter(outerJoin1, splitAt)").asClue { + it.get(0).columnNames shouldBe listOf("EmpID", "EmpName", "City") + it.get(0).columnTypes shouldBe listOf( + Int::class.javaObjectType, + String::class.java, + String::class.java, + ) + it.get(0).rowCount shouldBe 10 + it.get(1).columnNames shouldBe listOf("Designation") + it.get(1).columnTypes shouldBe listOf( + String::class.java, + ) + 
it.get(1).rowCount shouldBe 10 + } + } + } + context("Filter tests") { test("Constant filter") { eval("utils.filter(dataset, lambda **kwargs: False)").asClue { From 080eb6c5c9baa657da914c58a9cadab1693398c5 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Wed, 5 Jul 2023 15:27:07 -0400 Subject: [PATCH 08/24] Update withClue (cherry picked from commit 292cb45e0c016f80cb59db2ca85e08ddfb95ca62) --- .../imdc/extensions/common/DatasetExtensionsTests.kt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt index 718aa03..ebdbdca 100644 --- a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt +++ b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt @@ -95,10 +95,12 @@ class DatasetExtensionsTests : JythonTest( ) { private fun Dataset.asClue(assertions: (Dataset) -> Unit) { withClue( - lazy { - buildString { - printDataset(this, this@asClue, true) - } + { + lazy { + buildString { + printDataset(this, this@asClue, true) + } + }.value }, ) { assertions(this) From f443427b526b92fc22e4bb3227719a18415a1b6c Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Wed, 5 Jul 2023 16:09:18 -0400 Subject: [PATCH 09/24] Get rid of illegal reflective access warning (cherry picked from commit 512498d1eb49f0caf9eed44f263f7d47ec7e9c1c) --- common/build.gradle.kts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/build.gradle.kts b/common/build.gradle.kts index 856193a..8b2ea26 100644 --- a/common/build.gradle.kts +++ b/common/build.gradle.kts @@ -17,5 +17,7 @@ dependencies { tasks { withType { useJUnitPlatform() + + jvmArgs = listOf("--add-opens", "java.base/java.io=ALL-UNNAMED") } } From a91c538b522b1263a7d60b03c2929b9c5baf891a Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Tue, 11 Jul 2023 10:20:14 -0400 Subject: [PATCH 10/24] Get rid of illegal 
reflective access warning --- common/build.gradle.kts | 1 + .../imdc/extensions/common/DatasetExtensionsTests.kt | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/common/build.gradle.kts b/common/build.gradle.kts index 856193a..8df1fca 100644 --- a/common/build.gradle.kts +++ b/common/build.gradle.kts @@ -17,5 +17,6 @@ dependencies { tasks { withType { useJUnitPlatform() + jvmArgs = listOf("--add-opens", "java.base/java.io=ALL-UNNAMED") } } diff --git a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt index b7b2dce..32f9ced 100644 --- a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt +++ b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt @@ -56,10 +56,12 @@ class DatasetExtensionsTests : JythonTest( ) { private fun Dataset.asClue(assertions: (Dataset) -> Unit) { withClue( - lazy { - buildString { - printDataset(this, this@asClue, true) - } + { + lazy { + buildString { + printDataset(this, this@asClue, true) + } + }.value }, ) { assertions(this) From 8199b0ef79f35c45daf0fb65e473cbb9d1540218 Mon Sep 17 00:00:00 2001 From: Paul Griffith Date: Wed, 12 Jul 2023 11:49:38 -0700 Subject: [PATCH 11/24] Remove unnecessary lambda --- .../org/imdc/extensions/common/DatasetExtensionsTests.kt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt index 32f9ced..c3c8215 100644 --- a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt +++ b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt @@ -57,11 +57,9 @@ class DatasetExtensionsTests : JythonTest( private fun Dataset.asClue(assertions: (Dataset) -> Unit) { withClue( { - lazy { - buildString { - printDataset(this, 
this@asClue, true) - } - }.value + buildString { + printDataset(this, this@asClue, true) + } }, ) { assertions(this) From fdc341c74b916076ba12d6029b084fbc93aeec23 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Fri, 7 Jul 2023 10:33:59 +0000 Subject: [PATCH 12/24] Update inductiveautomation/ignition Docker tag to v8.1.29 --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index bec0d09..a3e3567 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,6 @@ services: gateway: - image: inductiveautomation/ignition:8.1.28 + image: inductiveautomation/ignition:8.1.29 ports: - 18088:8088 - 18000:8000 From bdb8087da5b03117b0e101e49fc1f0682db530f9 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 12:19:21 -0400 Subject: [PATCH 13/24] Implement left join of two datasets with working test case. (cherry picked from commit 0cb0a58d1e777a4599012035bbd46f183356fd39) --- .../extensions/common/DatasetExtensions.kt | 59 +++++++++++++++++++ .../common/DatasetExtensionsTests.kt | 28 +++++++++ 2 files changed, 87 insertions(+) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index 5b31d60..a9c5130 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -73,6 +73,65 @@ object DatasetExtensions { return builder.build() } + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], + types = [Dataset::class, Dataset::class, PyBaseString::class, PyBaseString::class], + ) + fun leftJoin(args: Array, keywords: Array): Dataset? 
{ + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), + arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), + "leftJoin", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() + val column = parsedArgs.requirePyObject("columnIndex").toJava() + val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + // Left Joins dataset to dataset2 on column = column2 + val columnName = List(dataset.columnCount) { col -> + dataset.columnNames[col] + } + val columnName2 = List(dataset2.columnCount) { col -> + dataset2.columnNames[col] + } + val combinedColumnName = columnName + columnName2 + val columnType = List>(dataset.columnCount) { col -> + dataset.columnTypes[col] + } + val columnType2 = List>(dataset2.columnCount) { col -> + dataset2.columnTypes[col] + } + val combinedColumnType = columnType + columnType2 + val builder = DatasetBuilder.newBuilder().colNames(combinedColumnName).colTypes(combinedColumnType) + + for (row in dataset.rowIndices) { + var found = false + val listToAppend = ArrayList() + for (row2 in dataset2.rowIndices) { + if (dataset[row, column] == dataset2[row2, column2]) { + found = true + break + } + } + for (col in 0 until dataset.columnCount) { + listToAppend.add(dataset[row, col]) + } + for (col in 0 until dataset2.columnCount) { + if (found) { + listToAppend.add(dataset2[row, col]) + } else { + listToAppend.add(null) + } + } + builder.addRow(*listToAppend.toTypedArray()) + } + return builder.build() + } + @Suppress("unused") @ScriptFunction(docBundlePrefix = "DatasetExtensions") @KeywordArgs( diff --git a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt index c3c8215..e07aedd 100644 --- 
a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt +++ b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt @@ -22,6 +22,12 @@ class DatasetExtensionsTests : JythonTest( .addRow(1, 3.14, "pi") .addRow(2, 6.28, "tau") .build() + globals["dataset2"] = DatasetBuilder.newBuilder() + .colNames("a", "b2", "c2") + .colTypes(Int::class.javaObjectType, Double::class.javaObjectType, String::class.java) + .addRow(1, 3.1415, "pi2") + .addRow(2, 56, "tau2") + .build() val excelSample = DatasetExtensionsTests::class.java.getResourceAsStream("sample.xlsx")!!.readAllBytes() @@ -103,6 +109,28 @@ class DatasetExtensionsTests : JythonTest( } } + context("Left Join test") { + test("Left Join") { + eval("utils.leftJoin(dataset, dataset2, 0, 0)").asClue { + it.columnNames shouldBe listOf("a", "b", "c", "a", "b2", "c2") + it.columnTypes shouldBe listOf( + Int::class.javaObjectType, + Double::class.javaObjectType, + String::class.java, + Int::class.javaObjectType, + Double::class.javaObjectType, + String::class.java, + ) + it.rowCount shouldBe 2 + it.getColumnAsList(0) shouldBe listOf(1, 2) + it.getColumnAsList(1) shouldBe listOf(3.14, 6.28) + it.getColumnAsList(2) shouldBe listOf("pi", "tau") + it.getColumnAsList(3) shouldBe listOf(1, 2) + it.getColumnAsList(4) shouldBe listOf(3.1415, 56.0) + it.getColumnAsList(5) shouldBe listOf("pi2", "tau2") + } + } + } context("Filter tests") { test("Constant filter") { eval("utils.filter(dataset, lambda **kwargs: False)").asClue { From df77faf6428c867e93f9894ba5a48e2414596bfb Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 12:25:16 -0400 Subject: [PATCH 14/24] Add description for leftJoin (cherry picked from commit 399777f1b6277f356d3bde53af3d56aac9f506e1) --- .../imdc/extensions/common/DatasetExtensions.properties | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties 
b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties index b7a5530..e4306a7 100644 --- a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties +++ b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties @@ -4,6 +4,13 @@ map.param.mapper=A callable reference to invoke for each row. Will receive each map.param.preserveColumnTypes=True if the types of the output dataset should match the input. Otherwise, the output dataset will lose type information. map.returns=A modified dataset. +leftJoin.desc=Performs a left join on two datasets, returning a new dataset. +leftJoin.param.dataset=The first dataset. Must not be null. +leftJoin.param.dataset2=The second dataset. Must not be null. +leftJoin.param.columnIndex=Column index in the first dataset to join on. +leftJoin.param.columnIndex2=Column index in the second dataset to join on. +leftJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + filter.desc=Runs a filtering function on each row in a dataset, returning a truncated dataset. filter.param.dataset=The dataset to filter. Must not be null. filter.param.filter=A function to run on each row. Will be called with keyword arguments matching column names. The first argument will be named 'row' and is the row index. Return True to keep the row in the output dataset. 
From 8b64dbe66fd55945b54d0de9a3fb4b31e3f9e18c Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 12:38:26 -0400 Subject: [PATCH 15/24] Change type of columnIndex to Int (cherry picked from commit b291f05b28e2f6f6c927755c9365ab643f833a33) --- .../main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index a9c5130..ea2aecd 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -77,7 +77,7 @@ object DatasetExtensions { @ScriptFunction(docBundlePrefix = "DatasetExtensions") @KeywordArgs( names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], - types = [Dataset::class, Dataset::class, PyBaseString::class, PyBaseString::class], + types = [Dataset::class, Dataset::class, Int::class, Int::class], ) fun leftJoin(args: Array, keywords: Array): Dataset? 
{ val parsedArgs = PyArgParser.parseArgs( From 9920015c46b3764f074060801b0765f639c23af6 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 14:16:59 -0400 Subject: [PATCH 16/24] Set temp value (cherry picked from commit 83dc40528c50cc143617d054dc3afafad6c39fc4) --- .../org/imdc/extensions/common/DatasetExtensions.kt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index ea2aecd..b09fe21 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -111,9 +111,11 @@ object DatasetExtensions { for (row in dataset.rowIndices) { var found = false val listToAppend = ArrayList() - for (row2 in dataset2.rowIndices) { - if (dataset[row, column] == dataset2[row2, column2]) { + var row2: Int? = null // Declare row2 outside the loop so it can be used later + for (rowIndex in dataset2.rowIndices) { + if (dataset[row, column] == dataset2[rowIndex, column2]) { found = true + row2 = rowIndex break } } @@ -122,7 +124,7 @@ object DatasetExtensions { } for (col in 0 until dataset2.columnCount) { if (found) { - listToAppend.add(dataset2[row, col]) + listToAppend.add(dataset2[row2!!, col]) } else { listToAppend.add(null) } From ea28eb348fc20bd56877899680a24f10d091097d Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 14:20:51 -0400 Subject: [PATCH 17/24] Clean up (cherry picked from commit 17ef1c5bd12ce62dd6f17a727c811777262c2caf) --- .../extensions/common/DatasetExtensions.kt | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index b09fe21..dd7c2ac 100644 --- 
a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -91,46 +91,43 @@ object DatasetExtensions { val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() val column = parsedArgs.requirePyObject("columnIndex").toJava() val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() - // Left Joins dataset to dataset2 on column = column2 - val columnName = List(dataset.columnCount) { col -> - dataset.columnNames[col] - } - val columnName2 = List(dataset2.columnCount) { col -> - dataset2.columnNames[col] - } + + val columnName = dataset.columnNames.toList() + val columnName2 = dataset2.columnNames.toList() val combinedColumnName = columnName + columnName2 - val columnType = List>(dataset.columnCount) { col -> - dataset.columnTypes[col] - } - val columnType2 = List>(dataset2.columnCount) { col -> - dataset2.columnTypes[col] - } + + val columnType = dataset.columnTypes.toList() + val columnType2 = dataset2.columnTypes.toList() val combinedColumnType = columnType + columnType2 - val builder = DatasetBuilder.newBuilder().colNames(combinedColumnName).colTypes(combinedColumnType) + + val builder = DatasetBuilder.newBuilder() + .colNames(combinedColumnName) + .colTypes(combinedColumnType) for (row in dataset.rowIndices) { - var found = false - val listToAppend = ArrayList() - var row2: Int? = null // Declare row2 outside the loop so it can be used later - for (rowIndex in dataset2.rowIndices) { + val listToAppend = Array(combinedColumnName.size) { null } + var row2: Int? 
= null + + dataset2.rowIndices.forEachIndexed { rowIndex, _ -> if (dataset[row, column] == dataset2[rowIndex, column2]) { - found = true row2 = rowIndex - break + return@forEachIndexed } } - for (col in 0 until dataset.columnCount) { - listToAppend.add(dataset[row, col]) + + dataset.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[colIndex] = dataset[row, colIndex] } - for (col in 0 until dataset2.columnCount) { - if (found) { - listToAppend.add(dataset2[row2!!, col]) - } else { - listToAppend.add(null) + + if (row2 != null) { + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[row2!!, colIndex] } } - builder.addRow(*listToAppend.toTypedArray()) + + builder.addRow(*listToAppend.copyOf(combinedColumnName.size)) } + return builder.build() } From 8d32fad4be4935be60543e99fb2da65cc207c2d4 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 14:33:21 -0400 Subject: [PATCH 18/24] Clean up (cherry picked from commit 89a7acfe14d9474569db78b91393ae72a88e5a06) --- .../org/imdc/extensions/common/DatasetExtensions.kt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index dd7c2ac..fcb54af 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -105,7 +105,7 @@ object DatasetExtensions { .colTypes(combinedColumnType) for (row in dataset.rowIndices) { - val listToAppend = Array(combinedColumnName.size) { null } + val listToAppend = arrayOfNulls(combinedColumnName.size) var row2: Int? 
= null dataset2.rowIndices.forEachIndexed { rowIndex, _ -> @@ -119,13 +119,13 @@ object DatasetExtensions { listToAppend[colIndex] = dataset[row, colIndex] } - if (row2 != null) { + row2?.let { r2 -> dataset2.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[dataset.columnCount + colIndex] = dataset2[row2!!, colIndex] + listToAppend[dataset.columnCount + colIndex] = dataset2[r2, colIndex] } } - builder.addRow(*listToAppend.copyOf(combinedColumnName.size)) + builder.addRow(*listToAppend) } return builder.build() From 9aa819a31cd47337b5db0fd2a753b3378f225f7d Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Wed, 5 Jul 2023 12:39:09 -0400 Subject: [PATCH 19/24] Add splitter of dataset (cherry picked from commit 78538522f18c030595c693e9a4a4d0af6682a597) --- .../extensions/common/DatasetExtensions.kt | 236 ++++++++++++++++++ .../common/DatasetExtensions.properties | 26 ++ .../common/DatasetExtensionsTests.kt | 111 ++++++++ 3 files changed, 373 insertions(+) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index fcb54af..4c7425a 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -131,6 +131,242 @@ object DatasetExtensions { return builder.build() } + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], + types = [Dataset::class, Dataset::class, Int::class, Int::class], + ) + fun innerJoin(args: Array, keywords: Array): Dataset? 
{ + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), + arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), + "innerJoin", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() + val column = parsedArgs.requirePyObject("columnIndex").toJava() + val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + + val columnName = dataset.columnNames.toList() + val columnName2 = dataset2.columnNames.toList() + val combinedColumnName = columnName + columnName2 + + val columnType = dataset.columnTypes.toList() + val columnType2 = dataset2.columnTypes.toList() + val combinedColumnType = columnType + columnType2 + + val builder = DatasetBuilder.newBuilder() + .colNames(combinedColumnName) + .colTypes(combinedColumnType) + + for (row in dataset.rowIndices) { + val listToAppend = arrayOfNulls(combinedColumnName.size) + var row2: Int? = null + + dataset2.rowIndices.forEachIndexed { rowIndex, _ -> + if (dataset[row, column] == dataset2[rowIndex, column2]) { + row2 = rowIndex + return@forEachIndexed + } + } + + if (row2 != null) { + dataset.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[colIndex] = dataset[row, colIndex] + } + + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[row2!!, colIndex] + } + + builder.addRow(*listToAppend) + } + } + + return builder.build() + } + + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], + types = [Dataset::class, Dataset::class, Int::class, Int::class], + ) + fun rightJoin(args: Array, keywords: Array): Dataset? 
{ + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), + arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), + "rightJoin", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() + val column = parsedArgs.requirePyObject("columnIndex").toJava() + val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + + val columnName = dataset.columnNames.toList() + val columnName2 = dataset2.columnNames.toList() + val combinedColumnName = columnName + columnName2 + + val columnType = dataset.columnTypes.toList() + val columnType2 = dataset2.columnTypes.toList() + val combinedColumnType = columnType + columnType2 + + val builder = DatasetBuilder.newBuilder() + .colNames(combinedColumnName) + .colTypes(combinedColumnType) + + for (row in dataset2.rowIndices) { + val listToAppend = arrayOfNulls(combinedColumnName.size) + var row2: Int? = null + + dataset.rowIndices.forEachIndexed { rowIndex, _ -> + if (dataset[rowIndex, column] == dataset2[row, column2]) { + row2 = rowIndex + return@forEachIndexed + } + } + + if (row2 != null) { + dataset.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[colIndex] = dataset[row2!!, colIndex] + } + + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[row, colIndex] + } + + builder.addRow(*listToAppend) + } + } + + return builder.build() + } + + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], + types = [Dataset::class, Dataset::class, Int::class, Int::class], + ) + fun outerJoin(args: Array, keywords: Array): Dataset? 
{ + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), + arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), + "outerJoin", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() + val column = parsedArgs.requirePyObject("columnIndex").toJava() + val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + + val columnName = dataset.columnNames.toList() + val columnName2 = dataset2.columnNames.toList() + val combinedColumnName = columnName + columnName2 + + val columnType = dataset.columnTypes.toList() + val columnType2 = dataset2.columnTypes.toList() + val combinedColumnType = columnType + columnType2 + + val builder = DatasetBuilder.newBuilder() + .colNames(combinedColumnName) + .colTypes(combinedColumnType) + + for (row in dataset.rowIndices) { + val listToAppend = arrayOfNulls(combinedColumnName.size) + var row2: Int? = null + + dataset2.rowIndices.forEachIndexed { rowIndex, _ -> + if (dataset[row, column] == dataset2[rowIndex, column2]) { + row2 = rowIndex + return@forEachIndexed + } + } + + dataset.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[colIndex] = dataset[row, colIndex] + } + + row2?.let { r2 -> + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[r2, colIndex] + } + } + + builder.addRow(*listToAppend) + } + + // Add unmatched rows from dataset2 + for (row2 in dataset2.rowIndices) { + val listToAppend = arrayOfNulls(combinedColumnName.size) + var row1: Int? 
= null + + dataset.rowIndices.forEachIndexed { rowIndex, _ -> + if (dataset[rowIndex, column] == dataset2[row2, column2]) { + row1 = rowIndex + return@forEachIndexed + } + } + + if (row1 == null) { + dataset2.columnIndices.forEachIndexed { colIndex, _ -> + listToAppend[dataset.columnCount + colIndex] = dataset2[row2, colIndex] + } + builder.addRow(*listToAppend) + } + } + + return builder.build() + } + + @Suppress("unused") + @ScriptFunction(docBundlePrefix = "DatasetExtensions") + @KeywordArgs( + names = ["dataset", "columnsToSplit"], + types = [Dataset::class, Array>::class], + ) + fun splitter(args: Array, keywords: Array): Array { + val parsedArgs = PyArgParser.parseArgs( + args, + keywords, + arrayOf("dataset", "columnsToSplit"), + arrayOf(Dataset::class.java, PyObject::class.java), + "splitter", + ) + val dataset = parsedArgs.requirePyObject("dataset").toJava() + val columnsToSplit = parsedArgs.requirePyObject("columnsToSplit").toJava>>() + val datasetSplit = Array(columnsToSplit.size) { null } + + for ((currentDataset, newDataSets) in columnsToSplit.withIndex()) { + val columnNames = mutableListOf() + val columnTypes = mutableListOf>() + + newDataSets.forEachIndexed { _, column -> + columnNames.add(dataset.columnNames[column]) + columnTypes.add(dataset.columnTypes[column]) + } + + val builder = DatasetBuilder.newBuilder() + .colNames(columnNames) + .colTypes(columnTypes) + + for (row in dataset.rowIndices) { + val listToAppend = newDataSets.map { dataset[row, it] }.toTypedArray() + builder.addRow(*listToAppend) + } + + datasetSplit[currentDataset] = builder.build() + } + + return datasetSplit + } + @Suppress("unused") @ScriptFunction(docBundlePrefix = "DatasetExtensions") @KeywordArgs( diff --git a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties index e4306a7..f6eab57 100644 --- 
a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties +++ b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties @@ -11,6 +11,27 @@ leftJoin.param.columnIndex=Column index in the first dataset to join on. leftJoin.param.columnIndex2=Column index in the second dataset to join on. leftJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. +rightJoin.desc=Performs a right join on two datasets, returning a new dataset. +rightJoin.param.dataset=The first dataset. Must not be null. +rightJoin.param.dataset2=The second dataset. Must not be null. +rightJoin.param.columnIndex=Column index in the first dataset to join on. +rightJoin.param.columnIndex2=Column index in the second dataset to join on. +rightJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + +innerJoin.desc=Performs an inner join on two datasets, returning a new dataset. +innerJoin.param.dataset=The first dataset. Must not be null. +innerJoin.param.dataset2=The second dataset. Must not be null. +innerJoin.param.columnIndex=Column index in the first dataset to join on. +innerJoin.param.columnIndex2=Column index in the second dataset to join on. +innerJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + +outerJoin.desc=Performs a full outer join on two datasets, returning a new dataset. +outerJoin.param.dataset=The first dataset. Must not be null. +outerJoin.param.dataset2=The second dataset. Must not be null. +outerJoin.param.columnIndex=Column index in the first dataset to join on. +outerJoin.param.columnIndex2=Column index in the second dataset to join on. +outerJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + filter.desc=Runs a filtering function on each row in a dataset, returning a truncated dataset. filter.param.dataset=The dataset to filter. Must not be null. filter.param.filter=A function to run on each row.
Will be called with keyword arguments matching column names. The first argument will be named 'row' and is the row index. Return True to keep the row in the output dataset. @@ -43,6 +64,11 @@ equals.param.dataset1=The first dataset. Must not be null. equals.param.dataset2=The second dataset. Must not be null. equals.returns=True if the two datasets have the same number of columns, with the same types, in the same order, with the same data in each row. +splitter.desc=Splits a dataset into any number of datasets, based on a list of columns. +splitter.param.dataset=Dataset to split. Must not be null. +splitter.param.columnsToSplit=List of columns that you would like to split dataset into +splitter.returns=List of datasets + valuesEqual.desc=Compares two datasets' content. valuesEqual.param.dataset1=The first dataset. Must not be null. valuesEqual.param.dataset2=The second dataset. Must not be null. diff --git a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt index e07aedd..b1dcabb 100644 --- a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt +++ b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt @@ -3,6 +3,7 @@ package org.imdc.extensions.common import com.inductiveautomation.ignition.common.BasicDataset import com.inductiveautomation.ignition.common.Dataset import com.inductiveautomation.ignition.common.util.DatasetBuilder +import io.kotest.assertions.asClue import io.kotest.assertions.withClue import io.kotest.engine.spec.tempfile import io.kotest.matchers.shouldBe @@ -16,6 +17,32 @@ class DatasetExtensionsTests : JythonTest( { globals -> globals["utils"] = DatasetExtensions globals["builder"] = DatasetBuilder.newBuilder() + globals["outerJoin1"] = DatasetBuilder.newBuilder() + .colNames("EmpID", "EmpName", "City", "Designation") + .colTypes(Int::class.javaObjectType, String::class.java, String::class.java, 
String::class.java) + .addRow(1, "Charlotte Robinson", "Chicago", "Consultant") + .addRow(2, "Madison Phillips", "Dallas", "Senior Analyst") + .addRow(3, "Emma Hernandez", "Phoenix", "Senior Analyst") + .addRow(4, "Samantha Sanchez", "San Diego", "Principal Conultant") + .addRow(5, "Sadie Ward", "San Antonio", "Consultant") + .addRow(6, "Savannah Perez", "New York", "Principal Conultant") + .addRow(7, "Victoria Gray", "Los Angeles", "Assistant") + .addRow(8, "Alyssa Lewis", "Houston", "Consultant") + .addRow(9, "Anna Lee", "San Jose", "Principal Conultant") + .addRow(10, "Riley Hall", "Philadelphia", "Senior Analyst") + .build() + globals["outerJoin2"] = DatasetBuilder.newBuilder() + .colNames("EmpID", "Department_ID", "DepartmentName") + .colTypes(Int::class.javaObjectType, Int::class.javaObjectType, String::class.java) + .addRow(1, 0, "Executive") + .addRow(2, 1, "Document Control") + .addRow(3, 2, "Finance") + .addRow(4, 3, "Engineering") + .addRow(5, 4, "Facilities and Maintenance") + .addRow(6, 2, "Finance") + .addRow(10, 4, "Facilities and Maintenance") + .build() + globals["dataset"] = DatasetBuilder.newBuilder() .colNames("a", "b", "c") .colTypes(Int::class.javaObjectType, Double::class.javaObjectType, String::class.java) @@ -29,6 +56,12 @@ class DatasetExtensionsTests : JythonTest( .addRow(2, 56, "tau2") .build() + val tempArray: Array> = arrayOf( + arrayOf(0, 1, 2), + arrayOf(3), + ) + globals["splitAt"] = tempArray + val excelSample = DatasetExtensionsTests::class.java.getResourceAsStream("sample.xlsx")!!.readAllBytes() val tempXlsx = tempfile(suffix = "xlsx").also { @@ -131,6 +164,84 @@ class DatasetExtensionsTests : JythonTest( } } } + + // https://www.sqlshack.com/sql-outer-join-overview-and-examples/ + context("Outer Join test") { + test("Outer Join") { + eval("utils.outerJoin(outerJoin1, outerJoin2, 0, 0)").asClue { + it.columnNames shouldBe listOf("EmpID", "EmpName", "City", "Designation", "EmpID", "Department_ID", "DepartmentName") + 
it.columnTypes shouldBe listOf( + Int::class.javaObjectType, + String::class.java, + String::class.java, + String::class.java, + Int::class.javaObjectType, + Int::class.javaObjectType, + String::class.java, + ) + it.getColumnAsList(6) shouldBe listOf("Executive", "Document Control", "Finance", "Engineering", "Facilities and Maintenance", "Finance", null, null, null, "Facilities and Maintenance") + it.rowCount shouldBe 10 + } + } + } + + context("Right Join test") { + test("Right Join") { + eval("utils.rightJoin(outerJoin1, outerJoin2, 0, 0)").asClue { + it.columnNames shouldBe listOf("EmpID", "EmpName", "City", "Designation", "EmpID", "Department_ID", "DepartmentName") + it.columnTypes shouldBe listOf( + Int::class.javaObjectType, + String::class.java, + String::class.java, + String::class.java, + Int::class.javaObjectType, + Int::class.javaObjectType, + String::class.java, + ) + it.getColumnAsList(5) shouldBe listOf(0, 1, 2, 3, 4, 2, 4) + it.rowCount shouldBe 7 + } + } + } + + context("Inner Join test") { + test("Inner Join") { + eval("utils.innerJoin(outerJoin1, outerJoin2, 0, 0)").asClue { + it.columnNames shouldBe listOf("EmpID", "EmpName", "City", "Designation", "EmpID", "Department_ID", "DepartmentName") + it.columnTypes shouldBe listOf( + Int::class.javaObjectType, + String::class.java, + String::class.java, + String::class.java, + Int::class.javaObjectType, + Int::class.javaObjectType, + String::class.java, + ) + it.getColumnAsList(5) shouldBe listOf(0, 1, 2, 3, 4, 2, 4) + it.rowCount shouldBe 7 + } + } + } + + context("Dataset Splitter") { + test("Split Dataset") { + eval>("utils.splitter(outerJoin1, splitAt)").asClue { + it.get(0).columnNames shouldBe listOf("EmpID", "EmpName", "City") + it.get(0).columnTypes shouldBe listOf( + Int::class.javaObjectType, + String::class.java, + String::class.java, + ) + it.get(0).rowCount shouldBe 10 + it.get(1).columnNames shouldBe listOf("Designation") + it.get(1).columnTypes shouldBe listOf( + String::class.java, + ) + 
it.get(1).rowCount shouldBe 10 + } + } + } + context("Filter tests") { test("Constant filter") { eval("utils.filter(dataset, lambda **kwargs: False)").asClue { From 7ec8fa73128bad9bb063900b4f99482af980bb41 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 30 Jun 2023 12:25:16 -0400 Subject: [PATCH 20/24] Add description for leftJoin (cherry picked from commit 399777f1b6277f356d3bde53af3d56aac9f506e1) --- .../imdc/extensions/common/DatasetExtensions.properties | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties index b7a5530..e4306a7 100644 --- a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties +++ b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties @@ -4,6 +4,13 @@ map.param.mapper=A callable reference to invoke for each row. Will receive each map.param.preserveColumnTypes=True if the types of the output dataset should match the input. Otherwise, the output dataset will lose type information. map.returns=A modified dataset. +leftJoin.desc=Performs a left join on two datasets, returning a new dataset. +leftJoin.param.dataset=The first dataset. Must not be null. +leftJoin.param.dataset2=The second dataset. Must not be null. +leftJoin.param.columnIndex=Column index in the first dataset to join on. +leftJoin.param.columnIndex2=Column index in the second dataset to join on. +leftJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + filter.desc=Runs a filtering function on each row in a dataset, returning a truncated dataset. filter.param.dataset=The dataset to filter. Must not be null. filter.param.filter=A function to run on each row. Will be called with keyword arguments matching column names. The first argument will be named 'row' and is the row index. 
Return True to keep the row in the output dataset. From edea2804eaac385c91727cf0e81b4e3485099005 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Wed, 5 Jul 2023 12:39:09 -0400 Subject: [PATCH 21/24] Add splitter of dataset (cherry picked from commit 78538522f18c030595c693e9a4a4d0af6682a597) --- .../common/DatasetExtensions.properties | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties index e4306a7..f6eab57 100644 --- a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties +++ b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties @@ -11,6 +11,27 @@ leftJoin.param.columnIndex=Column index in the first dataset to join on. leftJoin.param.columnIndex2=Column index in the second dataset to join on. leftJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. +rightJoin.desc=Performs a right join on two datasets, returning a new dataset. +rightJoin.param.dataset=The first dataset. Must not be null. +rightJoin.param.dataset2=The second dataset. Must not be null. +rightJoin.param.columnIndex=Column index in the first dataset to join on. +rightJoin.param.columnIndex2=Column index in the second dataset to join on. +rightJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + +innerJoin.desc=Performs a right join on two datasets, returning a new dataset. +innerJoin.param.dataset=The first dataset. Must not be null. +innerJoin.param.dataset2=The second dataset. Must not be null. +innerJoin.param.columnIndex=Column index in the first dataset to join on. +innerJoin.param.columnIndex2=Column index in the second dataset to join on. +innerJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. 
+ +outerJoin.desc=Performs a right join on two datasets, returning a new dataset. +outerJoin.param.dataset=The first dataset. Must not be null. +outerJoin.param.dataset2=The second dataset. Must not be null. +outerJoin.param.columnIndex=Column index in the first dataset to join on. +outerJoin.param.columnIndex2=Column index in the second dataset to join on. +outerJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. + filter.desc=Runs a filtering function on each row in a dataset, returning a truncated dataset. filter.param.dataset=The dataset to filter. Must not be null. filter.param.filter=A function to run on each row. Will be called with keyword arguments matching column names. The first argument will be named 'row' and is the row index. Return True to keep the row in the output dataset. @@ -43,6 +64,11 @@ equals.param.dataset1=The first dataset. Must not be null. equals.param.dataset2=The second dataset. Must not be null. equals.returns=True if the two datasets have the same number of columns, with the same types, in the same order, with the same data in each row. +splitter.desc=Splits a dataset into any number of datasets, based on a list of columns. +splitter.param.dataset=Dataset to split. Must not be null. +splitter.param.columnsToSplit=List of columns that you would like to split dataset into +splitter.returns=List of datasets + valuesEqual.desc=Compares two datasets' content. valuesEqual.param.dataset1=The first dataset. Must not be null. valuesEqual.param.dataset2=The second dataset. Must not be null. 
From f98b789501f9f45f1beb0040c88b7f76c4678af6 Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Thu, 20 Jul 2023 16:22:25 -0400 Subject: [PATCH 22/24] Convert to PyObject --- .../extensions/common/DatasetExtensions.kt | 83 +++++++++++-------- .../common/DatasetExtensions.properties | 8 +- .../common/DatasetExtensionsTests.kt | 2 +- 3 files changed, 52 insertions(+), 41 deletions(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index 4c7425a..3575c7e 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -76,58 +76,71 @@ object DatasetExtensions { @Suppress("unused") @ScriptFunction(docBundlePrefix = "DatasetExtensions") @KeywordArgs( - names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], - types = [Dataset::class, Dataset::class, Int::class, Int::class], + names = ["leftDataset", "rightDataset", "joinOn"], + types = [Dataset::class, Dataset::class, PyObject::class], ) fun leftJoin(args: Array, keywords: Array): Dataset? 
{ val parsedArgs = PyArgParser.parseArgs( args, keywords, - arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), - arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), + arrayOf("leftDataset", "rightDataset", "joinOn"), + arrayOf(Dataset::class.java, Dataset::class.java, PyObject::class.java), "leftJoin", ) - val dataset = parsedArgs.requirePyObject("dataset").toJava() - val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() - val column = parsedArgs.requirePyObject("columnIndex").toJava() - val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + val leftDataset = parsedArgs.requirePyObject("leftDataset").toJava() + val rightDataset = parsedArgs.requirePyObject("rightDataset").toJava() + val joinOn = parsedArgs.requirePyObject("joinOn") as PyFunction - val columnName = dataset.columnNames.toList() - val columnName2 = dataset2.columnNames.toList() - val combinedColumnName = columnName + columnName2 + val leftColumnNames = leftDataset.columnNames.toList() + val rightColumnNames = rightDataset.columnNames.toList() + val combinedColumnNames = leftColumnNames + rightColumnNames - val columnType = dataset.columnTypes.toList() - val columnType2 = dataset2.columnTypes.toList() - val combinedColumnType = columnType + columnType2 + val leftColumnTypes = leftDataset.columnTypes.toList() + val rightColumnTypes = rightDataset.columnTypes.toList() + val combinedColumnTypes = leftColumnTypes + rightColumnTypes val builder = DatasetBuilder.newBuilder() - .colNames(combinedColumnName) - .colTypes(combinedColumnType) - - for (row in dataset.rowIndices) { - val listToAppend = arrayOfNulls(combinedColumnName.size) - var row2: Int? 
= null - - dataset2.rowIndices.forEachIndexed { rowIndex, _ -> - if (dataset[row, column] == dataset2[rowIndex, column2]) { - row2 = rowIndex - return@forEachIndexed + .colNames(combinedColumnNames) + .colTypes(combinedColumnTypes) + + for (leftRow in leftDataset.rowIndices) { + var rowMatched = false // To track if a row from leftDataset has any matching rows in rightDataset + var matchingRow = -1 + for (rightRow in rightDataset.rowIndices) { + // Compare the values in each row of the left and right datasets based on the given columns + val leftRowValues = Array(leftDataset.columnCount) { col -> + leftDataset[leftRow, col] + } + val rightRowValues = Array(rightDataset.columnCount) { col -> + rightDataset[rightRow, col] } - } - dataset.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[colIndex] = dataset[row, colIndex] + rowMatched = joinOn.__call__(Py.java2py(leftRowValues), Py.java2py(rightRowValues)).toJava() + if (rowMatched) { + matchingRow = rightRow + break + } } - - row2?.let { r2 -> - dataset2.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[dataset.columnCount + colIndex] = dataset2[r2, colIndex] + if (!rowMatched) { + // If no match found for the left row, add it with null values for right dataset columns + val leftRowValues = Array(leftDataset.columnCount) { col -> + leftDataset[leftRow, col] + } + val rightRowValues = Array(rightDataset.columnCount) { null } + val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() + builder.addRow(*totalArray.toTypedArray()) + } else { + // If match found for the left row, add it with values for right dataset columns + val leftRowValues = Array(leftDataset.columnCount) { col -> + leftDataset[leftRow, col] + } + val rightRowValues = Array(rightDataset.columnCount) { col -> + rightDataset[matchingRow, col] } + val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() + builder.addRow(*totalArray.toTypedArray()) } - - builder.addRow(*listToAppend) } - 
return builder.build() } diff --git a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties index f6eab57..9688191 100644 --- a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties +++ b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties @@ -5,11 +5,9 @@ map.param.preserveColumnTypes=True if the types of the output dataset should mat map.returns=A modified dataset. leftJoin.desc=Performs a left join on two datasets, returning a new dataset. -leftJoin.param.dataset=The first dataset. Must not be null. -leftJoin.param.dataset2=The second dataset. Must not be null. -leftJoin.param.columnIndex=Column index in the first dataset to join on. -leftJoin.param.columnIndex2=Column index in the second dataset to join on. -leftJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. +leftJoin.param.leftDataset=Left dataset. Must not be null. +leftJoin.param.rightDataset=Right dataset. Must not be null. +leftJoin.returns=Logic to join datasets on the specified columns of dataset1 and dataset2. rightJoin.desc=Performs a right join on two datasets, returning a new dataset. rightJoin.param.dataset=The first dataset. Must not be null. 
diff --git a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt index ebdbdca..90f2b44 100644 --- a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt +++ b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt @@ -146,7 +146,7 @@ class DatasetExtensionsTests : JythonTest( context("Left Join test") { test("Left Join") { - eval("utils.leftJoin(dataset, dataset2, 0, 0)").asClue { + eval("utils.leftJoin(dataset, dataset2, lambda d1, d2: (d1[0] == d2[0]))").asClue { it.columnNames shouldBe listOf("a", "b", "c", "a", "b2", "c2") it.columnTypes shouldBe listOf( Int::class.javaObjectType, From 9ab8ad25a75f874e513018d1f9ae3ff8d9e250cd Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Thu, 20 Jul 2023 16:33:55 -0400 Subject: [PATCH 23/24] Clean up --- .../extensions/common/DatasetExtensions.kt | 42 ++++++++----------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index 3575c7e..82d5084 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -91,6 +91,9 @@ object DatasetExtensions { val rightDataset = parsedArgs.requirePyObject("rightDataset").toJava() val joinOn = parsedArgs.requirePyObject("joinOn") as PyFunction + val leftColumnCount = leftDataset.columnCount + val rightColumnCount = rightDataset.columnCount + val leftColumnNames = leftDataset.columnNames.toList() val rightColumnNames = rightDataset.columnNames.toList() val combinedColumnNames = leftColumnNames + rightColumnNames @@ -104,43 +107,34 @@ object DatasetExtensions { .colTypes(combinedColumnTypes) for (leftRow in leftDataset.rowIndices) { - var rowMatched = false // To track if a row from 
leftDataset has any matching rows in rightDataset var matchingRow = -1 + val leftRowValues = Array(leftColumnCount) { col -> + leftDataset[leftRow, col] + } + for (rightRow in rightDataset.rowIndices) { - // Compare the values in each row of the left and right datasets based on the given columns - val leftRowValues = Array(leftDataset.columnCount) { col -> - leftDataset[leftRow, col] - } - val rightRowValues = Array(rightDataset.columnCount) { col -> + val rightRowValues = Array(rightColumnCount) { col -> rightDataset[rightRow, col] } - rowMatched = joinOn.__call__(Py.java2py(leftRowValues), Py.java2py(rightRowValues)).toJava() - if (rowMatched) { + if (joinOn.__call__(Py.java2py(leftRowValues), Py.java2py(rightRowValues)).toJava()) { matchingRow = rightRow break } } - if (!rowMatched) { - // If no match found for the left row, add it with null values for right dataset columns - val leftRowValues = Array(leftDataset.columnCount) { col -> - leftDataset[leftRow, col] - } - val rightRowValues = Array(rightDataset.columnCount) { null } - val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() - builder.addRow(*totalArray.toTypedArray()) - } else { - // If match found for the left row, add it with values for right dataset columns - val leftRowValues = Array(leftDataset.columnCount) { col -> - leftDataset[leftRow, col] - } - val rightRowValues = Array(rightDataset.columnCount) { col -> + + val rightRowValues = if (matchingRow != -1) { + Array(rightColumnCount) { col -> rightDataset[matchingRow, col] } - val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() - builder.addRow(*totalArray.toTypedArray()) + } else { + Array(rightColumnCount) { null } } + + val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() + builder.addRow(*totalArray.toTypedArray()) } + return builder.build() } From d06742fa654ce8dc830d757222e0d339de2c12cc Mon Sep 17 00:00:00 2001 From: JosephTLockwood Date: Fri, 21 Jul 2023 
09:15:45 -0400 Subject: [PATCH 24/24] Add right, left, inner, outer to joiner function. --- .../extensions/common/DatasetExtensions.kt | 276 +++++++++--------- .../common/DatasetExtensions.properties | 31 +- .../common/DatasetExtensionsTests.kt | 26 +- 3 files changed, 149 insertions(+), 184 deletions(-) diff --git a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt index 82d5084..0101176 100644 --- a/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt +++ b/common/src/main/kotlin/org/imdc/extensions/common/DatasetExtensions.kt @@ -76,21 +76,42 @@ object DatasetExtensions { @Suppress("unused") @ScriptFunction(docBundlePrefix = "DatasetExtensions") @KeywordArgs( - names = ["leftDataset", "rightDataset", "joinOn"], - types = [Dataset::class, Dataset::class, PyObject::class], + names = ["leftDataset", "rightDataset", "joinType", "joinOn"], + types = [Dataset::class, Dataset::class, String::class, PyObject::class], ) - fun leftJoin(args: Array, keywords: Array): Dataset? { + fun joiner(args: Array, keywords: Array): Dataset? 
{ val parsedArgs = PyArgParser.parseArgs( args, keywords, - arrayOf("leftDataset", "rightDataset", "joinOn"), - arrayOf(Dataset::class.java, Dataset::class.java, PyObject::class.java), - "leftJoin", + arrayOf("leftDataset", "rightDataset", "joinType", "joinOn"), + arrayOf(Dataset::class.java, Dataset::class.java, String::class.java, PyObject::class.java), + "joiner", ) val leftDataset = parsedArgs.requirePyObject("leftDataset").toJava() val rightDataset = parsedArgs.requirePyObject("rightDataset").toJava() + val joinType = parsedArgs.requirePyObject("joinType").toJava() val joinOn = parsedArgs.requirePyObject("joinOn") as PyFunction + return when (joinType) { + "left" -> { + leftJoin(leftDataset, rightDataset, joinOn) + } + "right" -> { + rightJoin(leftDataset, rightDataset, joinOn) + } + "inner" -> { + innerJoin(leftDataset, rightDataset, joinOn) + } + "outer" -> { + outerJoin(leftDataset, rightDataset, joinOn) + } + else -> { + throw Py.ValueError("joinType must be one of 'left', 'right', 'inner', or 'outer'") + } + } + } + @Suppress("unused") + private fun leftJoin(leftDataset: Dataset, rightDataset: Dataset, joinOn: PyFunction): Dataset? 
{ val leftColumnCount = leftDataset.columnCount val rightColumnCount = rightDataset.columnCount @@ -108,12 +129,12 @@ object DatasetExtensions { for (leftRow in leftDataset.rowIndices) { var matchingRow = -1 - val leftRowValues = Array(leftColumnCount) { col -> + val leftRowValues = Array(leftColumnCount) { col -> leftDataset[leftRow, col] } for (rightRow in rightDataset.rowIndices) { - val rightRowValues = Array(rightColumnCount) { col -> + val rightRowValues = Array(rightColumnCount) { col -> rightDataset[rightRow, col] } @@ -124,7 +145,7 @@ object DatasetExtensions { } val rightRowValues = if (matchingRow != -1) { - Array(rightColumnCount) { col -> + Array(rightColumnCount) { col -> rightDataset[matchingRow, col] } } else { @@ -138,194 +159,157 @@ object DatasetExtensions { return builder.build() } - @Suppress("unused") - @ScriptFunction(docBundlePrefix = "DatasetExtensions") - @KeywordArgs( - names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], - types = [Dataset::class, Dataset::class, Int::class, Int::class], - ) - fun innerJoin(args: Array, keywords: Array): Dataset? { - val parsedArgs = PyArgParser.parseArgs( - args, - keywords, - arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), - arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), - "innerJoin", - ) - val dataset = parsedArgs.requirePyObject("dataset").toJava() - val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() - val column = parsedArgs.requirePyObject("columnIndex").toJava() - val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + private fun rightJoin(leftDataset: Dataset, rightDataset: Dataset, joinOn: PyFunction): Dataset? 
{ + val leftColumnCount = leftDataset.columnCount + val rightColumnCount = rightDataset.columnCount - val columnName = dataset.columnNames.toList() - val columnName2 = dataset2.columnNames.toList() - val combinedColumnName = columnName + columnName2 + val leftColumnNames = leftDataset.columnNames.toList() + val rightColumnNames = rightDataset.columnNames.toList() + val combinedColumnNames = leftColumnNames + rightColumnNames - val columnType = dataset.columnTypes.toList() - val columnType2 = dataset2.columnTypes.toList() - val combinedColumnType = columnType + columnType2 + val leftColumnTypes = leftDataset.columnTypes.toList() + val rightColumnTypes = rightDataset.columnTypes.toList() + val combinedColumnTypes = leftColumnTypes + rightColumnTypes val builder = DatasetBuilder.newBuilder() - .colNames(combinedColumnName) - .colTypes(combinedColumnType) - - for (row in dataset.rowIndices) { - val listToAppend = arrayOfNulls(combinedColumnName.size) - var row2: Int? = null + .colNames(combinedColumnNames) + .colTypes(combinedColumnTypes) - dataset2.rowIndices.forEachIndexed { rowIndex, _ -> - if (dataset[row, column] == dataset2[rowIndex, column2]) { - row2 = rowIndex - return@forEachIndexed - } + for (rightRow in rightDataset.rowIndices) { + var matchingRow = -1 + val rightRowValues = Array(rightColumnCount) { col -> + rightDataset[rightRow, col] } - if (row2 != null) { - dataset.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[colIndex] = dataset[row, colIndex] + for (leftRow in leftDataset.rowIndices) { + val leftRowValues = Array(leftColumnCount) { col -> + leftDataset[leftRow, col] } - dataset2.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[dataset.columnCount + colIndex] = dataset2[row2!!, colIndex] + if (joinOn.__call__(Py.java2py(leftRowValues), Py.java2py(rightRowValues)).toJava()) { + matchingRow = leftRow + break } + } - builder.addRow(*listToAppend) + val leftRowValues = if (matchingRow != -1) { + Array(leftColumnCount) { col ->
+ leftDataset[matchingRow, col] + } + } else { + Array(leftColumnCount) { null } } + + val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() + builder.addRow(*totalArray.toTypedArray()) } return builder.build() } - @Suppress("unused") - @ScriptFunction(docBundlePrefix = "DatasetExtensions") - @KeywordArgs( - names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], - types = [Dataset::class, Dataset::class, Int::class, Int::class], - ) - fun rightJoin(args: Array, keywords: Array): Dataset? { - val parsedArgs = PyArgParser.parseArgs( - args, - keywords, - arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), - arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), - "rightJoin", - ) - val dataset = parsedArgs.requirePyObject("dataset").toJava() - val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() - val column = parsedArgs.requirePyObject("columnIndex").toJava() - val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + private fun innerJoin(leftDataset: Dataset, rightDataset: Dataset, joinOn: PyFunction): Dataset? 
{ + val leftColumnCount = leftDataset.columnCount + val rightColumnCount = rightDataset.columnCount - val columnName = dataset.columnNames.toList() - val columnName2 = dataset2.columnNames.toList() - val combinedColumnName = columnName + columnName2 + val leftColumnNames = leftDataset.columnNames.toList() + val rightColumnNames = rightDataset.columnNames.toList() + val combinedColumnNames = leftColumnNames + rightColumnNames - val columnType = dataset.columnTypes.toList() - val columnType2 = dataset2.columnTypes.toList() - val combinedColumnType = columnType + columnType2 + val leftColumnTypes = leftDataset.columnTypes.toList() + val rightColumnTypes = rightDataset.columnTypes.toList() + val combinedColumnTypes = leftColumnTypes + rightColumnTypes val builder = DatasetBuilder.newBuilder() - .colNames(combinedColumnName) - .colTypes(combinedColumnType) - - for (row in dataset2.rowIndices) { - val listToAppend = arrayOfNulls(combinedColumnName.size) - var row2: Int? = null + .colNames(combinedColumnNames) + .colTypes(combinedColumnTypes) - dataset.rowIndices.forEachIndexed { rowIndex, _ -> - if (dataset[rowIndex, column] == dataset2[row, column2]) { - row2 = rowIndex - return@forEachIndexed - } + for (leftRow in leftDataset.rowIndices) { + val leftRowValues = Array(leftColumnCount) { col -> + leftDataset[leftRow, col] } - if (row2 != null) { - dataset.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[colIndex] = dataset[row2!!, colIndex] + for (rightRow in rightDataset.rowIndices) { + val rightRowValues = Array(rightColumnCount) { col -> + rightDataset[rightRow, col] } - dataset2.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[dataset.columnCount + colIndex] = dataset2[row, colIndex] + if (joinOn.__call__(Py.java2py(leftRowValues), Py.java2py(rightRowValues)).toJava()) { + val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() + builder.addRow(*totalArray.toTypedArray()) } - - builder.addRow(*listToAppend) } } return 
builder.build() } - @Suppress("unused") - @ScriptFunction(docBundlePrefix = "DatasetExtensions") - @KeywordArgs( - names = ["dataset", "dataset2", "columnIndex", "columnIndex2"], - types = [Dataset::class, Dataset::class, Int::class, Int::class], - ) - fun outerJoin(args: Array, keywords: Array): Dataset? { - val parsedArgs = PyArgParser.parseArgs( - args, - keywords, - arrayOf("dataset", "dataset2", "columnIndex", "columnIndex2"), - arrayOf(Dataset::class.java, Dataset::class.java, Int::class.java, Int::class.java), - "outerJoin", - ) - val dataset = parsedArgs.requirePyObject("dataset").toJava() - val dataset2 = parsedArgs.requirePyObject("dataset2").toJava() - val column = parsedArgs.requirePyObject("columnIndex").toJava() - val column2 = parsedArgs.requirePyObject("columnIndex2").toJava() + private fun outerJoin(leftDataset: Dataset, rightDataset: Dataset, joinOn: PyFunction): Dataset? { + val leftColumnCount = leftDataset.columnCount + val rightColumnCount = rightDataset.columnCount - val columnName = dataset.columnNames.toList() - val columnName2 = dataset2.columnNames.toList() - val combinedColumnName = columnName + columnName2 + val leftColumnNames = leftDataset.columnNames.toList() + val rightColumnNames = rightDataset.columnNames.toList() + val combinedColumnNames = leftColumnNames + rightColumnNames - val columnType = dataset.columnTypes.toList() - val columnType2 = dataset2.columnTypes.toList() - val combinedColumnType = columnType + columnType2 + val leftColumnTypes = leftDataset.columnTypes.toList() + val rightColumnTypes = rightDataset.columnTypes.toList() + val combinedColumnTypes = leftColumnTypes + rightColumnTypes val builder = DatasetBuilder.newBuilder() - .colNames(combinedColumnName) - .colTypes(combinedColumnType) + .colNames(combinedColumnNames) + .colTypes(combinedColumnTypes) - for (row in dataset.rowIndices) { - val listToAppend = arrayOfNulls(combinedColumnName.size) - var row2: Int? 
= null + for (leftRow in leftDataset.rowIndices) { + val leftRowValues = Array(leftColumnCount) { col -> + leftDataset[leftRow, col] + } + + var matched = false - dataset2.rowIndices.forEachIndexed { rowIndex, _ -> - if (dataset[row, column] == dataset2[rowIndex, column2]) { - row2 = rowIndex - return@forEachIndexed + for (rightRow in rightDataset.rowIndices) { + val rightRowValues = Array(rightColumnCount) { col -> + rightDataset[rightRow, col] + } + + if (joinOn.__call__(Py.java2py(leftRowValues), Py.java2py(rightRowValues)).toJava()) { + matched = true + val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() + builder.addRow(*totalArray.toTypedArray()) } } - dataset.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[colIndex] = dataset[row, colIndex] + if (!matched) { + // If no match found for the left row, add null values for the right dataset columns + val rightRowValues = Array(rightColumnCount) { null } + val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() + builder.addRow(*totalArray.toTypedArray()) } + } - row2?.let { r2 -> - dataset2.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[dataset.columnCount + colIndex] = dataset2[r2, colIndex] - } + // Add rows from the right dataset that don't have a match in the left dataset + for (rightRow in rightDataset.rowIndices) { + val rightRowValues = Array(rightColumnCount) { col -> + rightDataset[rightRow, col] } - builder.addRow(*listToAppend) - } + var matched = false - // Add unmatched rows from dataset2 - for (row2 in dataset2.rowIndices) { - val listToAppend = arrayOfNulls(combinedColumnName.size) - var row1: Int? 
= null + for (leftRow in leftDataset.rowIndices) { + val leftRowValues = Array(leftColumnCount) { col -> + leftDataset[leftRow, col] + } - dataset.rowIndices.forEachIndexed { rowIndex, _ -> - if (dataset[rowIndex, column] == dataset2[row2, column2]) { - row1 = rowIndex - return@forEachIndexed + if (joinOn.__call__(Py.java2py(leftRowValues), Py.java2py(rightRowValues)).toJava()) { + matched = true + break } } - if (row1 == null) { - dataset2.columnIndices.forEachIndexed { colIndex, _ -> - listToAppend[dataset.columnCount + colIndex] = dataset2[row2, colIndex] - } - builder.addRow(*listToAppend) + if (!matched) { + // If no match found for the right row, add null values for the left dataset columns + val leftRowValues = Array(leftColumnCount) { null } + val totalArray = leftRowValues.toMutableList() + rightRowValues.toMutableList() + builder.addRow(*totalArray.toTypedArray()) } } diff --git a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties index 9688191..7499a28 100644 --- a/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties +++ b/common/src/main/resources/org/imdc/extensions/common/DatasetExtensions.properties @@ -4,31 +4,12 @@ map.param.mapper=A callable reference to invoke for each row. Will receive each map.param.preserveColumnTypes=True if the types of the output dataset should match the input. Otherwise, the output dataset will lose type information. map.returns=A modified dataset. -leftJoin.desc=Performs a left join on two datasets, returning a new dataset. -leftJoin.param.leftDataset=Left dataset. Must not be null. -leftJoin.param.rightDataset=Right dataset. Must not be null. -leftJoin.returns=Logic to join datasets on the specified columns of dataset1 and dataset2. - -rightJoin.desc=Performs a right join on two datasets, returning a new dataset. -rightJoin.param.dataset=The first dataset. Must not be null. 
-rightJoin.param.dataset2=The second dataset. Must not be null. -rightJoin.param.columnIndex=Column index in the first dataset to join on. -rightJoin.param.columnIndex2=Column index in the second dataset to join on. -rightJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. - -innerJoin.desc=Performs a right join on two datasets, returning a new dataset. -innerJoin.param.dataset=The first dataset. Must not be null. -innerJoin.param.dataset2=The second dataset. Must not be null. -innerJoin.param.columnIndex=Column index in the first dataset to join on. -innerJoin.param.columnIndex2=Column index in the second dataset to join on. -innerJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. - -outerJoin.desc=Performs a right join on two datasets, returning a new dataset. -outerJoin.param.dataset=The first dataset. Must not be null. -outerJoin.param.dataset2=The second dataset. Must not be null. -outerJoin.param.columnIndex=Column index in the first dataset to join on. -outerJoin.param.columnIndex2=Column index in the second dataset to join on. -outerJoin.returns=A new dataset joined on the specified columns of dataset1 and dataset2. +joiner.desc=Joins two datasets together, using a join condition evaluated for each pair of rows. +joiner.param.leftDataset=The left dataset. Must not be null. +joiner.param.rightDataset=The right dataset. Must not be null. +joiner.param.joinType=The type of join to perform. Must be one of "inner", "left", "right", or "outer". Must not be null. +joiner.param.joinOn=A callable invoked with the values of a left row and a right row; return True when the two rows match. Columns are accessed by index. Must not be null. +joiner.returns=The joined dataset. filter.desc=Runs a filtering function on each row in a dataset, returning a truncated dataset. filter.param.dataset=The dataset to filter. Must not be null.
diff --git a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt index 90f2b44..cdd7af3 100644 --- a/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt +++ b/common/src/test/kotlin/org/imdc/extensions/common/DatasetExtensionsTests.kt @@ -146,7 +146,7 @@ class DatasetExtensionsTests : JythonTest( context("Left Join test") { test("Left Join") { - eval("utils.leftJoin(dataset, dataset2, lambda d1, d2: (d1[0] == d2[0]))").asClue { + eval("utils.joiner(dataset, dataset2, 'left', lambda d1, d2: (d1[0] == d2[0]))").asClue { it.columnNames shouldBe listOf("a", "b", "c", "a", "b2", "c2") it.columnTypes shouldBe listOf( Int::class.javaObjectType, @@ -167,10 +167,9 @@ class DatasetExtensionsTests : JythonTest( } } - // https://www.sqlshack.com/sql-outer-join-overview-and-examples/ - context("Outer Join test") { - test("Outer Join") { - eval("utils.outerJoin(outerJoin1, outerJoin2, 0, 0)").asClue { + context("Right Join test") { + test("Right Join") { + eval("utils.joiner(outerJoin1, outerJoin2, 'right', lambda d1, d2: (d1[0] == d2[0]))").asClue { it.columnNames shouldBe listOf("EmpID", "EmpName", "City", "Designation", "EmpID", "Department_ID", "DepartmentName") it.columnTypes shouldBe listOf( Int::class.javaObjectType, @@ -181,15 +180,16 @@ class DatasetExtensionsTests : JythonTest( Int::class.javaObjectType, String::class.java, ) - it.getColumnAsList(6) shouldBe listOf("Executive", "Document Control", "Finance", "Engineering", "Facilities and Maintenance", "Finance", null, null, null, "Facilities and Maintenance") - it.rowCount shouldBe 10 + it.getColumnAsList(5) shouldBe listOf(0, 1, 2, 3, 4, 2, 4) + it.rowCount shouldBe 7 } } } - context("Right Join test") { - test("Right Join") { - eval("utils.rightJoin(outerJoin1, outerJoin2, 0, 0)").asClue { + // https://www.sqlshack.com/sql-outer-join-overview-and-examples/ + context("Outer Join test") { + 
test("Outer Join") { + eval("utils.joiner(outerJoin1, outerJoin2, 'outer', lambda d1, d2: (d1[0] == d2[0]))").asClue { it.columnNames shouldBe listOf("EmpID", "EmpName", "City", "Designation", "EmpID", "Department_ID", "DepartmentName") it.columnTypes shouldBe listOf( Int::class.javaObjectType, @@ -200,15 +200,15 @@ class DatasetExtensionsTests : JythonTest( Int::class.javaObjectType, String::class.java, ) - it.getColumnAsList(5) shouldBe listOf(0, 1, 2, 3, 4, 2, 4) - it.rowCount shouldBe 7 + it.getColumnAsList(6) shouldBe listOf("Executive", "Document Control", "Finance", "Engineering", "Facilities and Maintenance", "Finance", null, null, null, "Facilities and Maintenance") + it.rowCount shouldBe 10 } } } context("Inner Join test") { test("Inner Join") { - eval("utils.innerJoin(outerJoin1, outerJoin2, 0, 0)").asClue { + eval("utils.joiner(outerJoin1, outerJoin2, 'inner', lambda d1, d2: (d1[0] == d2[0]))").asClue { it.columnNames shouldBe listOf("EmpID", "EmpName", "City", "Designation", "EmpID", "Department_ID", "DepartmentName") it.columnTypes shouldBe listOf( Int::class.javaObjectType,