Commit bb9debc

feat: update spark version for ydata-profiling (#1730)
* chore: update test pipeline to run a higher Spark version
* chore: update the Spark test CI/CD pipeline
* chore: fix Spark CI/CD
* chore: remove make spark-ci
* chore: add pytest to the dependencies
* fix: pin the numba version due to visions
* feat: update the pyspark install
* fix: pyproject
* chore: fix the Makefile to run the tests
* chore: tests for pyspark versions 3.4 and above
* fix: add other pyspark versions to the tests
* chore: use ubuntu-22.04
* chore: remove unused timestamp
1 parent ae0e2e3 commit bb9debc
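
For orientation, here is a minimal sketch (not part of this commit) of the usage path the updated pipeline exercises: profiling a Spark DataFrame through ydata-profiling's documented ProfileReport entry point, assuming the [spark] extra is installed alongside PySpark 3.4+ on Java 11. The sample data and session settings are illustrative only.

import datetime

import pandas as pd
from pyspark.sql import SparkSession
from ydata_profiling import ProfileReport

# Local Spark session; pip-installed PySpark bundles its own Spark runtime.
spark = SparkSession.builder.master("local[*]").appName("profiling-sketch").getOrCreate()

# Illustrative data only; it mirrors the shape of the test fixtures touched below.
pdf = pd.DataFrame(
    {
        "s1": [1.0] * 9,
        "somedate": [datetime.date(2011, 7, 4)] * 9,
    }
)
sdf = spark.createDataFrame(pdf)

# ProfileReport accepts the Spark DataFrame and uses the Spark backend for it.
report = ProfileReport(sdf, title="Spark profiling sketch")
report.to_file("report.html")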

File tree

5 files changed: +56 −68 lines

.github/workflows/pull-request.yml
.github/workflows/tests.yml
Makefile
pyproject.toml
tests/backends/spark_backend/test_descriptions_spark.py
.github/workflows/pull-request.yml

Lines changed: 3 additions & 3 deletions

@@ -9,7 +9,7 @@ on:
 jobs:
   commitlint:
     name: Lint commit message
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
@@ -21,7 +21,7 @@ jobs:
   lint:
     if: github.actor != 'renovate[bot]'
     name: Lint source code
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
@@ -85,7 +85,7 @@ jobs:
 
   validate-docs:
     name: Validate Docs
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4

.github/workflows/tests.yml

Lines changed: 36 additions & 49 deletions

@@ -122,64 +122,51 @@ jobs:
       - run: codecov -F py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.pandas }}-${{ matrix.numpy }}
 
   test_spark:
-    runs-on: ${{ matrix.os }}
-    continue-on-error: True
+    runs-on: ubuntu-22.04
+    continue-on-error: false
     strategy:
       matrix:
-        os: [ ubuntu-22.04 ]
-        python-version: [3.8]
-        pandas: ["pandas>1.1"]
-        spark: ["3.0.1"]
-        hadoop: [ 2.7 ]
-        numpy: ["numpy"]
-        java_home: [ /usr/lib/jvm/java-8-openjdk-amd64 ]
-
-    name: Tests Spark | python ${{ matrix.python-version }}, ${{ matrix.os }}, spark${{ matrix.spark }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
-    env:
-      JAVA_HOME: ${{ matrix.java_home }}
-      SPARK_VERSION: ${{ matrix.spark }}
-      HADOOP_VERSION: ${{ matrix.hadoop }}
-      SPARK_DIRECTORY: ${{ github.workspace }}/../
-      SPARK_HOME: ${{ github.workspace }}/../spark/
-      YDATA_PROFILING_NO_ANALYTICS: ${{ matrix.analytics }}
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        pyspark-version: [ "3.4", "3.5" ]
+
+    name: Tests Spark | Python ${{ matrix.python-version }} | PySpark ${{ matrix.pyspark-version }}
+
     steps:
-      - uses: actions/checkout@v4
-      - name: Setup python
+      - name: Checkout Code
+        uses: actions/checkout@v4
+
+      - name: Install Java (OpenJDK 11)
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openjdk-11-jdk
+          echo "JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> $GITHUB_ENV
+          echo "PATH=$JAVA_HOME/bin:$PATH" >> $GITHUB_ENV
+          java -version
+
+      - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'Linux')
+
+      - name: Cache pip dependencies
+        uses: actions/cache@v4
         with:
           path: ~/.cache/pip
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+          key: pip-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pyspark-version }}-${{ hashFiles('requirements/*.txt', 'setup.cfg', 'pyproject.toml') }}
           restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-\
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'macOS')
-        with:
-          path: ~/Library/Caches/pip
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'Windows')
-        with:
-          path: ~\AppData\Local\pip\Cache
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-
-      - run: |
-          pip install --upgrade pip setuptools wheel
-          pip install pytest-spark>=0.6.0 pyarrow==1.0.1 pyspark=="${{ matrix.spark }}"
+            pip-${{ runner.os }}-
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+          pip install pyarrow>4.0.0 pyspark=="${{ matrix.pyspark-version }}" --no-cache-dir
+          echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
+          echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
+
+      - name: Run Tests
+        run: |
+          make install
           pip install ".[test]"
-          pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}"
-      - if: ${{ matrix.spark != '3.0.1' }}
-        run: echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
-      - run: echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
-      - run: make install
-      - run: make install-spark-ci
-      - run: pip install ".[spark]" # Make sure the proper version of pandas is install after everything
-      - run: make test_spark
+          make test_spark
 

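The reworked job above drops the hand-rolled Spark download (SPARK_HOME, HADOOP_VERSION) because pip-installed PySpark ships its own Spark runtime. Below is a hypothetical sketch of the kind of session-scoped pytest fixture such tests can rely on; the repository's actual spark_session fixture lives in its conftest and may differ.

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    # A local-mode session is enough once PySpark comes from pip; the driver
    # host mirrors the SPARK_LOCAL_IP=127.0.0.1 exported by the workflow above.
    session = (
        SparkSession.builder.master("local[*]")
        .appName("ydata-profiling-spark-tests")
        .config("spark.driver.host", "127.0.0.1")
        .getOrCreate()
    )
    yield session
    session.stop()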
Makefile

Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@ test:
 	ydata_profiling -h
 
 test_spark:
-	pytest --spark_home=${SPARK_HOME} tests/backends/spark_backend/
+	pytest tests/backends/spark_backend/
 	ydata_profiling -h
 
 test_cov:
@@ -36,7 +36,7 @@ install-docs: install ### Installs regular and docs dependencies
 
 install-spark-ci:
 	sudo apt-get update
-	sudo apt-get -y install openjdk-8-jdk
+	sudo apt-get -y install openjdk-11-jdk
 	curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
 		--output ${SPARK_DIRECTORY}/spark.tgz
 	cd ${SPARK_DIRECTORY} && tar -xvzf spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark

pyproject.toml

Lines changed: 5 additions & 5 deletions

@@ -67,7 +67,7 @@ dependencies = [
     "imagehash==4.3.1",
     "wordcloud>=1.9.3",
     "dacite>=1.8",
-    "numba>=0.56.0, <1",
+    "numba>=0.56.0, <=0.61",
 ]
 
 dynamic = [
@@ -108,10 +108,10 @@ notebook = [
 # note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
 # set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
 spark = [
-    "pyspark>=2.3.0",
-    "pyarrow>=2.0.0",
-    "pandas>1.1, <2, !=1.4.0",
-    "numpy>=1.16.0,<1.24",
+    "pyspark>=3.0",
+    "pyarrow>=4.0.0",
+    "pandas>1.1",
+    "numpy>=1.16.0",
     "visions[type_image_path]>=0.7.5, <0.7.7",
 ]
 
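The comment kept above the spark extra refers to the ARROW_PRE_0_15_IPC_FORMAT workaround for PySpark 2.3/2.4. With the new pyspark>=3.0 and pyarrow>=4.0.0 floors, Arrow-backed toPandas() is instead controlled by a Spark SQL setting; a short sketch with an illustrative DataFrame follows.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("local[*]")
    .appName("arrow-topandas-sketch")
    # Spark 3.x switch for Arrow-accelerated pandas conversion.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

sdf = spark.range(0, 1000)  # illustrative data
pdf = sdf.toPandas()        # Arrow is used for the conversion when enabled
print(pdf.shape)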

tests/backends/spark_backend/test_descriptions_spark.py

Lines changed: 10 additions & 9 deletions

@@ -41,15 +41,15 @@ def describe_data():
         "s1": np.ones(9),
         "s2": ["some constant text $ % value {obj} " for _ in range(1, 10)],
         "somedate": [
-            datetime.datetime(2011, 7, 4),
-            datetime.datetime(2022, 1, 1, 13, 57),
-            datetime.datetime(1990, 12, 9),
+            datetime.date(2011, 7, 4),
+            datetime.date(2011, 7, 2),
+            datetime.date(1990, 12, 9),
             pd.NaT,
-            datetime.datetime(1990, 12, 9),
-            datetime.datetime(1970, 12, 9),
-            datetime.datetime(1972, 1, 2),
-            datetime.datetime(1970, 12, 9),
-            datetime.datetime(1970, 12, 9),
+            datetime.date(1990, 12, 9),
+            datetime.date(1970, 12, 9),
+            datetime.date(1972, 1, 2),
+            datetime.date(1970, 12, 9),
+            datetime.date(1970, 12, 9),
         ],
         "bool_tf": [True, True, False, True, False, True, True, False, True],
         "bool_tf_with_nan": [
@@ -370,13 +370,14 @@ def test_describe_spark_df(
 
     if column == "mixed":
         describe_data[column] = [str(i) for i in describe_data[column]]
-    if column == "bool_tf_with_nan":
+    elif column == "bool_tf_with_nan":
         describe_data[column] = [
             True if i else False for i in describe_data[column]  # noqa: SIM210
         ]
     pdf = pd.DataFrame({column: describe_data[column]})  # Convert to Pandas DataFrame
     # Ensure NaNs are replaced with None (Spark does not support NaN in non-float columns)
     pdf = pdf.where(pd.notna(pdf), None)
+
     sdf = spark_session.createDataFrame(pdf)
 
     results = describe(cfg, sdf, summarizer_spark, typeset)
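
The NaN-to-None replacement used in the test above is shown in isolation in the sketch below (data and session setup are illustrative): Spark rejects NaN/NaT markers in non-float columns, so they are mapped to None before createDataFrame.

import datetime

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

pdf = pd.DataFrame(
    {"somedate": [datetime.date(2011, 7, 4), pd.NaT, datetime.date(1990, 12, 9)]}
)
pdf = pdf.where(pd.notna(pdf), None)  # NaT/NaN -> None, as in the test
sdf = spark.createDataFrame(pdf)      # Spark infers a date column with a null
sdf.show()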
