From c34e46116e1298b390715965e3aeba71cc08124f Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Thu, 21 Aug 2025 14:56:09 +0200 Subject: [PATCH 1/4] Create Spark 3.5 release. --- .github/workflows/maven.yml | 8 +- build_antlr_parser.xml | 2 +- build_antlr_parser_xquery.xml | 2 +- pom.xml | 90 +++++++++---------- src/main/java/org/rumbledb/api/Rumble.java | 1 - src/main/java/org/rumbledb/cli/Main.java | 11 +-- .../org/rumbledb/items/Base64BinaryItem.java | 5 +- .../runtime/flwor/FlworDataFrameUtils.java | 4 +- .../flwor/clauses/LetClauseSparkIterator.java | 4 +- ...upClauseArrayMergeAggregateResultsUDF.java | 1 - .../navigation/ArrayLookupIterator.java | 2 +- .../ml/ApplyEstimatorRuntimeIterator.java | 23 ----- src/test/java/iq/DeltaUpdateRuntimeTests.java | 1 - src/test/java/iq/RuntimeTests.java | 4 - .../java/iq/UpdatesForRumbleBenchmark.java | 1 - 15 files changed, 58 insertions(+), 101 deletions(-) diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 07e2ba79e5..fb4a61e8d1 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -11,10 +11,10 @@ jobs: continue-on-error: true steps: - uses: actions/checkout@v3 - - name: Set up Java 17 + - name: Set up Java 11 uses: actions/setup-java@v3 with: - java-version: 17 + java-version: 11 distribution: adopt # Github ubuntu contain Maven, but other testing software use minimal size container - name: Check if Maven is Installed @@ -74,10 +74,10 @@ jobs: ] steps: - uses: actions/checkout@v3 - - name: Set up Java 17 + - name: Set up Java 11 uses: actions/setup-java@v3 with: - java-version: 17 + java-version: 11 distribution: adopt - name: Restore Cached Maven Packages uses: actions/cache@v3 diff --git a/build_antlr_parser.xml b/build_antlr_parser.xml index 8078e540d5..498ad99b84 100644 --- a/build_antlr_parser.xml +++ b/build_antlr_parser.xml @@ -25,7 +25,7 @@ - + diff --git a/build_antlr_parser_xquery.xml b/build_antlr_parser_xquery.xml index 9e9f419cd9..f9fb303fda 100644 --- a/build_antlr_parser_xquery.xml +++ b/build_antlr_parser_xquery.xml @@ -25,7 +25,7 @@ - + diff --git a/pom.xml b/pom.xml index 3542150f56..e5df3cda32 100644 --- a/pom.xml +++ b/pom.xml @@ -41,12 +41,12 @@ org.apache.maven.plugins maven-compiler-plugin - 3.13.0 + 3.8.1 eclipse - 17 - 17 + 11 + 11 -properties ${project.basedir}/org.eclipse.jdt.core.prefs @@ -56,12 +56,12 @@ org.codehaus.plexus plexus-compiler-eclipse - 2.13.0 + 2.8.5 org.eclipse.jdt ecj - 3.36.0 + 3.20.0 @@ -76,7 +76,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.4.2 + 3.1.2 @@ -117,7 +117,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.7.0 + 3.1.1 org.rumbledb.cli:org.rumbledb.config:sparksoniq.*:sparksoniq @@ -235,7 +235,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.3.0 + 3.3.1 ${surefire.enableOutErrElements} junit @@ -338,7 +337,7 @@ org.apache.commons commons-lang3 - 3.14.0 + 3.12.0 commons-net @@ -350,43 +349,38 @@ commons-io 2.18.0 - - org.apache.httpcomponents - httpclient - 4.5.13 - - - - org.jgrapht - jgrapht-core - 1.4.0 - - - joda-time - joda-time - 2.10.6 - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - 2.17.1 - - - io.delta - delta-spark_2.13 - 4.0.0 - + + org.apache.httpcomponents + httpclient + 4.5.13 + org.skyscreamer jsonassert 1.5.1 test - + + + org.jgrapht + jgrapht-core + 1.4.0 + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + 2.15.2 + + + io.delta + delta-spark_2.13 + 3.2.1 + + diff --git a/src/main/java/org/rumbledb/api/Rumble.java b/src/main/java/org/rumbledb/api/Rumble.java index 5915230045..dbac5e44aa 100644 --- a/src/main/java/org/rumbledb/api/Rumble.java +++ b/src/main/java/org/rumbledb/api/Rumble.java @@ -5,7 +5,6 @@ import org.rumbledb.context.DynamicContext; import org.rumbledb.expressions.module.MainModule; import org.rumbledb.runtime.RuntimeIterator; - import sparksoniq.spark.SparkSessionManager; import java.io.IOException; diff --git a/src/main/java/org/rumbledb/cli/Main.java b/src/main/java/org/rumbledb/cli/Main.java index 98504198d3..2a0e84e5ed 100644 --- a/src/main/java/org/rumbledb/cli/Main.java +++ b/src/main/java/org/rumbledb/cli/Main.java @@ -37,15 +37,12 @@ public class Main { public static void main(String[] args) throws IOException { String javaVersion = System.getProperty("java.version"); - if ( - !javaVersion.startsWith("17") - && !javaVersion.startsWith("21") - ) { - System.err.println("[Error] RumbleDB requires Java 17 or 21 (17 being the default Spark 4 version)."); + if (!javaVersion.startsWith("11.") && !javaVersion.startsWith("17.")) { + System.err.println("[Error] RumbleDB requires Java 11 or 17."); System.err.println("Your Java version: " + System.getProperty("java.version")); - System.err.println("You can download Java 17 or 21 from https://adoptium.net/"); + System.err.println("You can download Java 11 or 17 from https://adoptium.net/"); System.err.println( - "If you do have Java 17 or 21, but the wrong version appears above, then it means you need to set your JAVA_HOME environment variable properly to point to Java 17 or 21." + "If you do have Java 11 or 17, but the wrong version appears above, then it means you need to set your JAVA_HOME environment variable properly to point to Java 11 or 17." ); System.exit(43); } diff --git a/src/main/java/org/rumbledb/items/Base64BinaryItem.java b/src/main/java/org/rumbledb/items/Base64BinaryItem.java index dcbecf7f1d..19d3db9a1f 100644 --- a/src/main/java/org/rumbledb/items/Base64BinaryItem.java +++ b/src/main/java/org/rumbledb/items/Base64BinaryItem.java @@ -12,7 +12,7 @@ import org.rumbledb.runtime.misc.ComparisonIterator; import org.rumbledb.types.ItemType; -import java.util.Base64; +import javax.xml.bind.DatatypeConverter; import java.util.Arrays; import java.util.regex.Pattern; @@ -41,7 +41,6 @@ public Base64BinaryItem() { } public Base64BinaryItem(String stringValue) { - stringValue = stringValue.replaceAll("\\s", ""); this.value = parseBase64BinaryString(stringValue); this.stringValue = StringUtils.chomp(Base64.getEncoder().encodeToString(this.value)); } @@ -87,7 +86,7 @@ static byte[] parseBase64BinaryString(String base64BinaryString) throws IllegalA if (base64BinaryString == null || !checkInvalidBase64BinaryFormat(base64BinaryString.replaceAll("\\s", ""))) { throw new IllegalArgumentException(); } - return Base64.getDecoder().decode(base64BinaryString); + return DatatypeConverter.parseBase64Binary(base64BinaryString); } @Override diff --git a/src/main/java/org/rumbledb/runtime/flwor/FlworDataFrameUtils.java b/src/main/java/org/rumbledb/runtime/flwor/FlworDataFrameUtils.java index 1b3c24439e..4607d3925c 100644 --- a/src/main/java/org/rumbledb/runtime/flwor/FlworDataFrameUtils.java +++ b/src/main/java/org/rumbledb/runtime/flwor/FlworDataFrameUtils.java @@ -90,8 +90,6 @@ import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; -import scala.collection.immutable.ArraySeq; -import scala.collection.Iterator; import scala.collection.immutable.ArraySeq; import scala.collection.Iterator; import sparksoniq.spark.SparkSessionManager; @@ -832,7 +830,7 @@ public static void deserializeWrappedParameters( continue; } @SuppressWarnings("unchecked") - List deserializedParam = (List) deserializeByteArray(bytes, kryo, input); + List deserializedParam = (List) deserializeByteArray((byte[]) bytes, kryo, input); deserializedParams.add(deserializedParam); } } diff --git a/src/main/java/org/rumbledb/runtime/flwor/clauses/LetClauseSparkIterator.java b/src/main/java/org/rumbledb/runtime/flwor/clauses/LetClauseSparkIterator.java index 343a72c5a5..fe2e8189cd 100644 --- a/src/main/java/org/rumbledb/runtime/flwor/clauses/LetClauseSparkIterator.java +++ b/src/main/java/org/rumbledb/runtime/flwor/clauses/LetClauseSparkIterator.java @@ -276,7 +276,6 @@ public FlworDataFrame getDataFrameAsJoin( ); } } - if (rightDependencies.size() == 1 && rightDependencies.contains(Name.CONTEXT_ITEM)) { if (!leftDependencies.contains(Name.CONTEXT_ITEM)) { contextItemValueExpression = rightHandSideOfJoinEqualityCriterion; @@ -294,7 +293,6 @@ public FlworDataFrame getDataFrameAsJoin( getMetadata() ); } - // Now we know we can execute the query as an equi-join. // First, we evaluate all input tuples. Dataset inputDF = this.child.getDataFrame(context).getDataFrame(); @@ -328,6 +326,8 @@ public FlworDataFrame getDataFrameAsJoin( sequenceDependencies ).getDataFrame(); + LogManager.getLogger("LetClauseSparkIterator").info("Rumble detected an equi-join in the left clause."); + // We compute the hashes for both sides of the equality predicate. expressionDF = LetClauseSparkIterator.bindLetVariableInDataFrame( expressionDF, diff --git a/src/main/java/org/rumbledb/runtime/flwor/udfs/GroupClauseArrayMergeAggregateResultsUDF.java b/src/main/java/org/rumbledb/runtime/flwor/udfs/GroupClauseArrayMergeAggregateResultsUDF.java index 9506df407c..f575537fc1 100644 --- a/src/main/java/org/rumbledb/runtime/flwor/udfs/GroupClauseArrayMergeAggregateResultsUDF.java +++ b/src/main/java/org/rumbledb/runtime/flwor/udfs/GroupClauseArrayMergeAggregateResultsUDF.java @@ -29,7 +29,6 @@ import scala.collection.Iterator; import java.util.ArrayList; -// import java.util.Iterator; import java.util.List; public class GroupClauseArrayMergeAggregateResultsUDF implements UDF1, Object[]> { diff --git a/src/main/java/org/rumbledb/runtime/navigation/ArrayLookupIterator.java b/src/main/java/org/rumbledb/runtime/navigation/ArrayLookupIterator.java index f1f1054127..dc442ef476 100644 --- a/src/main/java/org/rumbledb/runtime/navigation/ArrayLookupIterator.java +++ b/src/main/java/org/rumbledb/runtime/navigation/ArrayLookupIterator.java @@ -231,7 +231,7 @@ public NativeClauseContext generateNativeQuery(NativeClauseContext nativeClauseC ) ); newContext.setSchema(((ArrayType) newContext.getSchema()).elementType()); - newContext.setResultingQuery("get(" + newContext.getResultingQuery() + " ," + (this.lookup - 1) + ")"); + newContext.setResultingQuery(newContext.getResultingQuery() + "[" + (this.lookup - 1) + "]"); } return newContext; } diff --git a/src/main/java/sparksoniq/spark/ml/ApplyEstimatorRuntimeIterator.java b/src/main/java/sparksoniq/spark/ml/ApplyEstimatorRuntimeIterator.java index 833731b26c..f6b655db4b 100644 --- a/src/main/java/sparksoniq/spark/ml/ApplyEstimatorRuntimeIterator.java +++ b/src/main/java/sparksoniq/spark/ml/ApplyEstimatorRuntimeIterator.java @@ -81,29 +81,6 @@ public Item materializeFirstItemOrNull( fittedModel = this.estimator.fit(this.inputDataset.getDataFrame(), paramMap); } catch (IllegalArgumentException | NoSuchElementException e) { String message = e.getMessage(); - if (message == null) { - System.err.println("Exception stack trace:"); - e.printStackTrace(); - RumbleException ex = new InvalidRumbleMLParamException( - "Parameters provided to " - + this.estimatorShortName - + " caused an error with no message." - + "Exception class: " - + e.getClass().getName() - + "\n" - + "\n\nWe are happy to give you a few hints:" - + "\nBy default, we look for the features used to train the model in the field 'features'." - + "\nIf this field does not exist, you can build it with the VectorAssembler transformer by combining the fields you want to include." - + "\n\nFor example:" - + "\nlet $vector-assembler := get-transformer(\"VectorAssembler\")" - + "\nlet $data := $vector-assembler($data, {\"inputCols\" : [ \"age\", \"weight\" ], \"outputCol\" : \"features\" })" - + "\n\nIf the features are in your data, but in a different field than 'features', you can specify that different field name with the parameter 'featuresCol' or 'inputCol' (check the documentation of the estimator to be sure) passed to your estimator." - + "\n\nIf the error says that it must be of the type struct,values:array> but was actually something different, then it means you specified a field that is not an assembled features array. You need to use the VectorAssembler to prepare it.", - getMetadata() - ); - ex.initCause(e); - throw ex; - } Pattern pattern = Pattern.compile("(.* ]) does not exist. Available: (.*)"); Matcher matcher = pattern.matcher(message); if (matcher.find()) { diff --git a/src/test/java/iq/DeltaUpdateRuntimeTests.java b/src/test/java/iq/DeltaUpdateRuntimeTests.java index df248c40a6..a5c932fabd 100644 --- a/src/test/java/iq/DeltaUpdateRuntimeTests.java +++ b/src/test/java/iq/DeltaUpdateRuntimeTests.java @@ -33,7 +33,6 @@ import org.junit.runners.Parameterized; import org.rumbledb.api.Item; import org.rumbledb.api.SequenceOfItems; -import scala.Function0; import scala.util.Properties; import scala.Function0; import sparksoniq.spark.SparkSessionManager; diff --git a/src/test/java/iq/RuntimeTests.java b/src/test/java/iq/RuntimeTests.java index 23cac550eb..47d432fe3a 100644 --- a/src/test/java/iq/RuntimeTests.java +++ b/src/test/java/iq/RuntimeTests.java @@ -21,9 +21,6 @@ package iq; import iq.base.AnnotationsTestsBase; -import scala.Function0; -import scala.util.Properties; - import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.junit.Assert; @@ -115,7 +112,6 @@ public static void setupSparkSession() { System.err.println("Scala version: " + scalaVersion); SparkConf sparkConfiguration = new SparkConf(); sparkConfiguration.setMaster("local[*]"); - sparkConfiguration.set("spark.sql.adaptive.enabled", "false"); sparkConfiguration.set("spark.submit.deployMode", "client"); sparkConfiguration.set("spark.executor.extraClassPath", "lib/"); sparkConfiguration.set("spark.driver.extraClassPath", "lib/"); diff --git a/src/test/java/iq/UpdatesForRumbleBenchmark.java b/src/test/java/iq/UpdatesForRumbleBenchmark.java index 3508834ccf..9cf71d1191 100644 --- a/src/test/java/iq/UpdatesForRumbleBenchmark.java +++ b/src/test/java/iq/UpdatesForRumbleBenchmark.java @@ -11,7 +11,6 @@ import org.rumbledb.config.RumbleRuntimeConfiguration; import org.rumbledb.exceptions.ExceptionMetadata; import org.rumbledb.runtime.functions.input.FileSystemUtil; -import scala.Function0; import scala.util.Properties; import scala.Function0; import sparksoniq.spark.SparkSessionManager; From f620ba77b6e19a0e1578289aa65d904dc6bd8fbf Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Thu, 21 Aug 2025 16:25:56 +0200 Subject: [PATCH 2/4] Fix pom. --- pom.xml | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/pom.xml b/pom.xml index 197135d9c5..d97a735bd8 100644 --- a/pom.xml +++ b/pom.xml @@ -349,38 +349,37 @@ commons-io 2.18.0 - - org.apache.httpcomponents - httpclient - 4.5.13 - + + org.apache.httpcomponents + httpclient + 4.5.13 + org.skyscreamer jsonassert 1.5.1 test - - - org.jgrapht - jgrapht-core - 1.4.0 - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - 2.15.2 - - - io.delta - delta-spark_2.13 - 3.2.1 - - + + + org.jgrapht + jgrapht-core + 1.4.0 + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + 2.15.2 + + + io.delta + delta-spark_2.13 + 3.2.1 + net.sf.py4j From 0d1e703c80eefddb27fcc4e7fb900228ce1c5f21 Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Fri, 26 Sep 2025 14:04:57 +0200 Subject: [PATCH 3/4] Update Spark 3.5 from 3.5.6 to 3.5.7 --- pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index b2ceb1956c..7f9c6f3d6c 100644 --- a/pom.xml +++ b/pom.xml @@ -276,19 +276,19 @@ org.apache.spark spark-core_2.13 - 3.5.6 + 3.5.7 provided org.apache.spark spark-sql_2.13 - 3.5.6 + 3.5.7 provided org.apache.spark spark-mllib_2.13 - 3.5.6 + 3.5.7 provided @@ -306,7 +306,7 @@ org.apache.spark spark-avro_2.13 - 3.5.6 + 3.5.7 org.antlr From da9a4465eb85043bb615be705032765206c374f1 Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Thu, 30 Oct 2025 16:54:52 +0100 Subject: [PATCH 4/4] Fix error. --- src/main/java/org/rumbledb/api/SequenceWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/rumbledb/api/SequenceWriter.java b/src/main/java/org/rumbledb/api/SequenceWriter.java index 641aaeca64..6bd85f3790 100644 --- a/src/main/java/org/rumbledb/api/SequenceWriter.java +++ b/src/main/java/org/rumbledb/api/SequenceWriter.java @@ -177,7 +177,7 @@ public SequenceWriter format(String source) { this.sequence, null, source, - (this.dataFrameWriter == null) ? this.mode : this.dataFrameWriter.curmode(), + (this.dataFrameWriter == null) ? this.mode : SaveMode.ErrorIfExists, (this.dataFrameWriter == null) ? this.outputFormatOptions : new HashMap<>(), this.configuration );