Skip to content

Commit 205669e

Browse files
authored
Merge branch 'develop' into develop
2 parents d0c38f2 + de97bd4 commit 205669e

File tree

21 files changed

+69
-103
lines changed

21 files changed

+69
-103
lines changed

.github/workflows/tests.yml

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
strategy:
1717
matrix:
1818
os: [ ubuntu-22.04 ]
19-
python-version: ["3.9", "3.10", "3.11", "3.12" ]
19+
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13" ]
2020
pandas: [ "pandas>1.1" ]
2121
numpy: [ "numpy>=1.21" ]
2222
runs-on: ${{ matrix.os }}
@@ -126,22 +126,26 @@ jobs:
126126
continue-on-error: false
127127
strategy:
128128
matrix:
129-
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
130-
pyspark-version: [ "3.4" , "3.5" ]
129+
include:
130+
# Legacy line (Spark 3.5.x)
131+
- { python-version: "3.10", pyspark-version: "3.5" }
132+
- { python-version: "3.11", pyspark-version: "3.5" }
133+
# Current line (Spark 4.0.x)
134+
- { python-version: "3.11", pyspark-version: "4.0" }
135+
- { python-version: "3.10", pyspark-version: "4.0" }
136+
- { python-version: "3.12", pyspark-version: "4.0" }
131137

132138
name: Tests Spark | Python ${{ matrix.python-version }} | PySpark ${{ matrix.pyspark-version }}
133139

134140
steps:
135141
- name: Checkout Code
136142
uses: actions/checkout@v4
137143

138-
- name: Install Java (OpenJDK 11)
139-
run: |
140-
sudo apt-get update
141-
sudo apt-get install -y openjdk-11-jdk
142-
echo "JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> $GITHUB_ENV
143-
echo "PATH=$JAVA_HOME/bin:$PATH" >> $GITHUB_ENV
144-
java -version
144+
- name: Setup Java 17 (Temurin)
145+
uses: actions/setup-java@v4
146+
with:
147+
distribution: temurin
148+
java-version: '17'
145149

146150
- name: Setup Python
147151
uses: actions/setup-python@v5
@@ -159,10 +163,15 @@ jobs:
159163
160164
- name: Install Dependencies
161165
run: |
162-
python -m pip install --upgrade pip setuptools wheel
163-
pip install pyarrow>4.0.0 pyspark=="${{ matrix.pyspark-version }}" --no-cache-dir
164-
echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
166+
python -m pip install -U pip setuptools wheel
167+
pip install "pyspark~=${{ matrix.pyspark-version }}" "pyarrow>4.0.0" --no-cache-dir
168+
pip install ".[test]"
169+
# Make PySpark use this Python and bind locally; give it a safe tmp dir
170+
echo "PYSPARK_PYTHON=$(which python)" >> $GITHUB_ENV
171+
echo "PYSPARK_DRIVER_PYTHON=$(which python)" >> $GITHUB_ENV
165172
echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
173+
echo "SPARK_LOCAL_DIRS=$RUNNER_TEMP/spark-tmp" >> $GITHUB_ENV
174+
mkdir -p "$RUNNER_TEMP/spark-tmp"
166175
167176
- name: Run Tests
168177
run: |

examples/features/spark_example.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414

1515
if __name__ == "__main__":
1616
spark_session = (
17-
SparkSession.builder.appName("SparkProfiling").master("local[*]").getOrCreate()
17+
SparkSession.builder.appName("SparkProfiling")
18+
.master("local[*]")
19+
.config("spark.sql.ansi.enabled", "false")
20+
.getOrCreate()
1821
)
1922

2023
print(spark_session.sparkContext.uiWebUrl) # noqa: T201

examples/meteorites/meteorites.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
if __name__ == "__main__":
1010
file_name = cache_file(
1111
"meteorites.csv",
12-
"https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
12+
"https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv",
1313
)
1414

1515
# Set a seed for reproducibility

examples/meteorites/meteorites_cloud.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
"source": [
6161
"file_name = cache_file(\n",
6262
" \"meteorites.csv\",\n",
63-
" \"https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD\",\n",
63+
" \"https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv\",\n",
6464
")\n",
6565
"\n",
6666
"df = pd.read_csv(file_name)\n",

pyproject.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ package_name = "ydata-profiling"
1111

1212
[project]
1313
name = "ydata-profiling"
14-
requires-python = ">=3.7,<3.13"
14+
requires-python = ">=3.7,<3.14"
1515
authors = [
1616
{name = "YData Labs Inc", email = "opensource@ydata.ai"}
1717
]
@@ -51,7 +51,8 @@ dependencies = [
5151
"numpy>=1.16.0,<2.2",
5252
# Could be optional
5353
# Related to HTML report
54-
"htmlmin==0.1.12",
54+
"minify-html>=0.15.0",
55+
"filetype>=1.0.0",
5556
# Correlations
5657
"phik>=0.11.1,<0.13",
5758
# Examples
@@ -108,7 +109,7 @@ notebook = [
108109
# note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
109110
# set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
110111
spark = [
111-
"pyspark>=3.0",
112+
"pyspark>=4.0",
112113
"pyarrow>=4.0.0",
113114
"pandas>1.1",
114115
"numpy>=1.16.0",

src/ydata_profiling/model/pandas/describe_image_pandas.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
import imghdr
21
from functools import partial
32
from pathlib import Path
43
from typing import Optional, Tuple, Union
54

5+
import filetype
66
import imagehash
77
import pandas as pd
88
from PIL import ExifTags, Image
@@ -12,7 +12,6 @@
1212
describe_image_1d,
1313
named_aggregate_summary,
1414
)
15-
from ydata_profiling.utils.imghdr_patch import * # noqa: F401,F403
1615

1716

1817
def open_image(path: Path) -> Optional[Image.Image]:
@@ -119,7 +118,8 @@ def extract_exif(image: Image) -> dict:
119118

120119

121120
def path_is_image(p: Path) -> bool:
122-
return imghdr.what(p) is not None
121+
guess = filetype.guess(str(p))
122+
return guess is not None and guess.mime.startswith("image/")
123123

124124

125125
def count_duplicate_hashes(image_descriptions: dict) -> int:

src/ydata_profiling/model/typeset.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import datetime
2-
import imghdr
32
import os
43
import warnings
54
from functools import partial, wraps
65
from typing import Callable, Sequence, Set
76
from urllib.parse import urlparse
87

8+
import filetype
99
import pandas as pd
1010
import visions
1111
from multimethod import multimethod
@@ -295,7 +295,11 @@ def get_relations() -> Sequence[TypeRelation]:
295295
@multimethod
296296
@series_handle_nulls
297297
def contains_op(series: pd.Series, state: dict) -> bool:
298-
return all(imghdr.what(p) for p in series)
298+
return all(
299+
filetype.guess(str(p))
300+
and filetype.guess(str(p)).mime.startswith("image/")
301+
for p in series
302+
)
299303

300304
class TimeSeries(visions.VisionsBaseType):
301305
@staticmethod

src/ydata_profiling/profile_report.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -451,9 +451,9 @@ def _render_html(self) -> str:
451451
)
452452

453453
if self.config.html.minify_html:
454-
from htmlmin.main import minify
454+
import minify_html
455455

456-
html = minify(html, remove_all_empty_space=True, remove_comments=True)
456+
html = minify_html.minify(html, keep_comments=False)
457457
pbar.update()
458458
return html
459459

src/ydata_profiling/utils/common.py

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@
88
import zipfile
99
from datetime import datetime, timedelta
1010

11-
# Monkeypatch bug in imagehdr
12-
from imghdr import tests
11+
# Image type detection
1312
from pathlib import Path
1413
from typing import Mapping
1514

@@ -64,35 +63,6 @@ def extract_zip(outfile, effective_path):
6463
raise ValueError("Bad zip file") from e
6564

6665

67-
def test_jpeg1(h, f):
68-
"""JPEG data in JFIF format"""
69-
if b"JFIF" in h[:23]:
70-
return "jpeg"
71-
72-
73-
JPEG_MARK = (
74-
b"\xff\xd8\xff\xdb\x00C\x00\x08\x06\x06"
75-
b"\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f"
76-
)
77-
78-
79-
def test_jpeg2(h, f):
80-
"""JPEG with small header"""
81-
if len(h) >= 32 and h[5] == 67 and h[:32] == JPEG_MARK:
82-
return "jpeg"
83-
84-
85-
def test_jpeg3(h, f):
86-
"""JPEG data in JFIF or Exif format"""
87-
if h[6:10] in (b"JFIF", b"Exif") or h[:2] == b"\xff\xd8":
88-
return "jpeg"
89-
90-
91-
tests.append(test_jpeg1)
92-
tests.append(test_jpeg2)
93-
tests.append(test_jpeg3)
94-
95-
9666
def convert_timestamp_to_datetime(timestamp: int) -> datetime:
9767
if timestamp >= 0:
9868
return datetime.fromtimestamp(timestamp)

src/ydata_profiling/utils/imghdr_patch.py

Lines changed: 0 additions & 31 deletions
This file was deleted.

0 commit comments

Comments
 (0)