Commit bb9debc

feat: update spark version for ydata-profiling (#1730)
* chore: update test pipeline to run a higher Spark version
* chore: update the Spark test CI/CD pipeline
* chore: fix Spark CI/CD
* chore: remove make spark-ci
* chore: add pytest to the dependencies
* fix: pin the numba version due to visions
* feat: update the pyspark install
* fix: pyproject
* chore: fix the Makefile to run the tests
* chore: tests for pyspark versions 3.4 and above
* fix: add other pyspark versions to the tests
* chore: use ubuntu-22.04
* chore: remove unused timestamp
1 parent ae0e2e3 commit bb9debc
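
For orientation, here is a minimal sketch (not part of this commit) of the usage path the updated pipeline exercises: profiling a Spark DataFrame through ydata-profiling's documented ProfileReport entry point, assuming the [spark] extra is installed alongside PySpark 3.4+ on Java 11. The sample data and session settings are illustrative only.

import datetime

import pandas as pd
from pyspark.sql import SparkSession
from ydata_profiling import ProfileReport

# Local Spark session; pip-installed PySpark bundles its own Spark runtime.
spark = SparkSession.builder.master("local[*]").appName("profiling-sketch").getOrCreate()

# Illustrative data only; it mirrors the shape of the test fixtures touched below.
pdf = pd.DataFrame(
    {
        "s1": [1.0] * 9,
        "somedate": [datetime.date(2011, 7, 4)] * 9,
    }
)
sdf = spark.createDataFrame(pdf)

# ProfileReport accepts the Spark DataFrame and uses the Spark backend for it.
report = ProfileReport(sdf, title="Spark profiling sketch")
report.to_file("report.html")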

File tree

5 files changed: +56 −68 lines

.github/workflows/pull-request.yml
.github/workflows/tests.yml
Makefile
pyproject.toml
tests/backends/spark_backend/test_descriptions_spark.py
.github/workflows/pull-request.yml

Lines changed: 3 additions & 3 deletions

@@ -9,7 +9,7 @@ on:
 jobs:
   commitlint:
     name: Lint commit message
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
@@ -21,7 +21,7 @@ jobs:
   lint:
     if: github.actor != 'renovate[bot]'
     name: Lint source code
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
@@ -85,7 +85,7 @@ jobs:
 
   validate-docs:
     name: Validate Docs
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4

.github/workflows/tests.yml

Lines changed: 36 additions & 49 deletions

@@ -122,64 +122,51 @@ jobs:
       - run: codecov -F py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.pandas }}-${{ matrix.numpy }}
 
   test_spark:
-    runs-on: ${{ matrix.os }}
-    continue-on-error: True
+    runs-on: ubuntu-22.04
+    continue-on-error: false
     strategy:
       matrix:
-        os: [ ubuntu-22.04 ]
-        python-version: [3.8]
-        pandas: ["pandas>1.1"]
-        spark: ["3.0.1"]
-        hadoop: [ 2.7 ]
-        numpy: ["numpy"]
-        java_home: [ /usr/lib/jvm/java-8-openjdk-amd64 ]
-
-    name: Tests Spark | python ${{ matrix.python-version }}, ${{ matrix.os }}, spark${{ matrix.spark }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
-    env:
-      JAVA_HOME: ${{ matrix.java_home }}
-      SPARK_VERSION: ${{ matrix.spark }}
-      HADOOP_VERSION: ${{ matrix.hadoop }}
-      SPARK_DIRECTORY: ${{ github.workspace }}/../
-      SPARK_HOME: ${{ github.workspace }}/../spark/
-      YDATA_PROFILING_NO_ANALYTICS: ${{ matrix.analytics }}
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        pyspark-version: [ "3.4", "3.5" ]
+
+    name: Tests Spark | Python ${{ matrix.python-version }} | PySpark ${{ matrix.pyspark-version }}
+
     steps:
-      - uses: actions/checkout@v4
-      - name: Setup python
+      - name: Checkout Code
+        uses: actions/checkout@v4
+
+      - name: Install Java (OpenJDK 11)
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openjdk-11-jdk
+          echo "JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> $GITHUB_ENV
+          echo "PATH=$JAVA_HOME/bin:$PATH" >> $GITHUB_ENV
+          java -version
+
+      - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'Linux')
+
+      - name: Cache pip dependencies
+        uses: actions/cache@v4
         with:
           path: ~/.cache/pip
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
+          key: pip-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pyspark-version }}-${{ hashFiles('requirements/*.txt', 'setup.cfg', 'pyproject.toml') }}
           restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-\
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'macOS')
-        with:
-          path: ~/Library/Caches/pip
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-
-      - uses: actions/cache@v4
-        if: startsWith(runner.os, 'Windows')
-        with:
-          path: ~\AppData\Local\pip\Cache
-          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-${{ matrix.pandas }}-pip-
-      - run: |
-          pip install --upgrade pip setuptools wheel
-          pip install pytest-spark>=0.6.0 pyarrow==1.0.1 pyspark=="${{ matrix.spark }}"
+            pip-${{ runner.os }}-
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+          pip install pyarrow>4.0.0 pyspark=="${{ matrix.pyspark-version }}" --no-cache-dir
+          echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
+          echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
+
+      - name: Run Tests
+        run: |
+          make install
           pip install ".[test]"
-          pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}"
-      - if: ${{ matrix.spark != '3.0.1' }}
-        run: echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
-      - run: echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
-      - run: make install
-      - run: make install-spark-ci
-      - run: pip install ".[spark]" # Make sure the proper version of pandas is install after everything
-      - run: make test_spark
+          make test_spark
 

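The reworked job above drops the hand-rolled Spark download (SPARK_HOME, HADOOP_VERSION) because pip-installed PySpark ships its own Spark runtime. Below is a hypothetical sketch of the kind of session-scoped pytest fixture such tests can rely on; the repository's actual spark_session fixture lives in its conftest and may differ.

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    # A local-mode session is enough once PySpark comes from pip; the driver
    # host mirrors the SPARK_LOCAL_IP=127.0.0.1 exported by the workflow above.
    session = (
        SparkSession.builder.master("local[*]")
        .appName("ydata-profiling-spark-tests")
        .config("spark.driver.host", "127.0.0.1")
        .getOrCreate()
    )
    yield session
    session.stop()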
Makefile

Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@ test:
 	ydata_profiling -h
 
 test_spark:
-	pytest --spark_home=${SPARK_HOME} tests/backends/spark_backend/
+	pytest tests/backends/spark_backend/
 	ydata_profiling -h
 
 test_cov:
@@ -36,7 +36,7 @@ install-docs: install ### Installs regular and docs dependencies
 
 install-spark-ci:
 	sudo apt-get update
-	sudo apt-get -y install openjdk-8-jdk
+	sudo apt-get -y install openjdk-11-jdk
 	curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
 		--output ${SPARK_DIRECTORY}/spark.tgz
 	cd ${SPARK_DIRECTORY} && tar -xvzf spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark

pyproject.toml

Lines changed: 5 additions & 5 deletions

@@ -67,7 +67,7 @@ dependencies = [
     "imagehash==4.3.1",
     "wordcloud>=1.9.3",
     "dacite>=1.8",
-    "numba>=0.56.0, <1",
+    "numba>=0.56.0, <=0.61",
 ]
 
 dynamic = [
@@ -108,10 +108,10 @@ notebook = [
 # note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
 # set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
 spark = [
-    "pyspark>=2.3.0",
-    "pyarrow>=2.0.0",
-    "pandas>1.1, <2, !=1.4.0",
-    "numpy>=1.16.0,<1.24",
+    "pyspark>=3.0",
+    "pyarrow>=4.0.0",
+    "pandas>1.1",
+    "numpy>=1.16.0",
     "visions[type_image_path]>=0.7.5, <0.7.7",
 ]
 
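The comment kept above the spark extra refers to the ARROW_PRE_0_15_IPC_FORMAT workaround for PySpark 2.3/2.4. With the new pyspark>=3.0 and pyarrow>=4.0.0 floors, Arrow-backed toPandas() is instead controlled by a Spark SQL setting; a short sketch with an illustrative DataFrame follows.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("local[*]")
    .appName("arrow-topandas-sketch")
    # Spark 3.x switch for Arrow-accelerated pandas conversion.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

sdf = spark.range(0, 1000)  # illustrative data
pdf = sdf.toPandas()        # Arrow is used for the conversion when enabled
print(pdf.shape)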

tests/backends/spark_backend/test_descriptions_spark.py

Lines changed: 10 additions & 9 deletions

@@ -41,15 +41,15 @@ def describe_data():
         "s1": np.ones(9),
         "s2": ["some constant text $ % value {obj} " for _ in range(1, 10)],
         "somedate": [
-            datetime.datetime(2011, 7, 4),
-            datetime.datetime(2022, 1, 1, 13, 57),
-            datetime.datetime(1990, 12, 9),
+            datetime.date(2011, 7, 4),
+            datetime.date(2011, 7, 2),
+            datetime.date(1990, 12, 9),
             pd.NaT,
-            datetime.datetime(1990, 12, 9),
-            datetime.datetime(1970, 12, 9),
-            datetime.datetime(1972, 1, 2),
-            datetime.datetime(1970, 12, 9),
-            datetime.datetime(1970, 12, 9),
+            datetime.date(1990, 12, 9),
+            datetime.date(1970, 12, 9),
+            datetime.date(1972, 1, 2),
+            datetime.date(1970, 12, 9),
+            datetime.date(1970, 12, 9),
         ],
         "bool_tf": [True, True, False, True, False, True, True, False, True],
         "bool_tf_with_nan": [
@@ -370,13 +370,14 @@ def test_describe_spark_df(
 
     if column == "mixed":
         describe_data[column] = [str(i) for i in describe_data[column]]
-    if column == "bool_tf_with_nan":
+    elif column == "bool_tf_with_nan":
         describe_data[column] = [
             True if i else False for i in describe_data[column]  # noqa: SIM210
         ]
     pdf = pd.DataFrame({column: describe_data[column]})  # Convert to Pandas DataFrame
     # Ensure NaNs are replaced with None (Spark does not support NaN in non-float columns)
     pdf = pdf.where(pd.notna(pdf), None)
+
     sdf = spark_session.createDataFrame(pdf)
 
     results = describe(cfg, sdf, summarizer_spark, typeset)
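
The NaN-to-None replacement used in the test above is shown in isolation in the sketch below (data and session setup are illustrative): Spark rejects NaN/NaT markers in non-float columns, so they are mapped to None before createDataFrame.

import datetime

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

pdf = pd.DataFrame(
    {"somedate": [datetime.date(2011, 7, 4), pd.NaT, datetime.date(1990, 12, 9)]}
)
pdf = pdf.where(pd.notna(pdf), None)  # NaT/NaN -> None, as in the test
sdf = spark.createDataFrame(pdf)      # Spark infers a date column with a null
sdf.show()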
