Merge branch 'develop' into develop

landonzeng · web-flow · commit 205669e1b51e · 2025-10-08T22:40:43.000+08:00
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         os: [ ubuntu-22.04 ]
-        python-version: ["3.9", "3.10", "3.11", "3.12" ]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13" ]
         pandas: [ "pandas>1.1" ]
         numpy: [ "numpy>=1.21" ]
     runs-on: ${{ matrix.os }}
@@ -126,22 +126,26 @@ jobs:
     continue-on-error: false
     strategy:
       matrix:
-        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
-        pyspark-version: [ "3.4" , "3.5" ]
+        include:
+          # Legacy line (Spark 3.5.x)
+          - { python-version: "3.10", pyspark-version: "3.5" }
+          - { python-version: "3.11", pyspark-version: "3.5" }
+          # Current line (Spark 4.0.x)
+          - { python-version: "3.11", pyspark-version: "4.0" }
+          - { python-version: "3.10", pyspark-version: "4.0" }
+          - { python-version: "3.12", pyspark-version: "4.0" }
 
     name: Tests Spark | Python ${{ matrix.python-version }} | PySpark ${{ matrix.pyspark-version }}
 
     steps:
       - name: Checkout Code
         uses: actions/checkout@v4
 
-      - name: Install Java (OpenJDK 11)
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y openjdk-11-jdk
-          echo "JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> $GITHUB_ENV
-          echo "PATH=$JAVA_HOME/bin:$PATH" >> $GITHUB_ENV
-          java -version
+      - name: Setup Java 17 (Temurin)
+        uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: '17'
 
       - name: Setup Python
         uses: actions/setup-python@v5
@@ -159,10 +163,15 @@ jobs:
 
       - name: Install Dependencies
         run: |
-          python -m pip install --upgrade pip setuptools wheel
-          pip install pyarrow>4.0.0 pyspark=="${{ matrix.pyspark-version }}" --no-cache-dir
-          echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
+          python -m pip install -U pip setuptools wheel
+          pip install "pyspark~=${{ matrix.pyspark-version }}" "pyarrow>4.0.0" --no-cache-dir
+          pip install ".[test]"
+          # Make PySpark use this Python and bind locally; give it a safe tmp dir
+          echo "PYSPARK_PYTHON=$(which python)" >> $GITHUB_ENV
+          echo "PYSPARK_DRIVER_PYTHON=$(which python)" >> $GITHUB_ENV
           echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
+          echo "SPARK_LOCAL_DIRS=$RUNNER_TEMP/spark-tmp" >> $GITHUB_ENV
+          mkdir -p "$RUNNER_TEMP/spark-tmp"
 
       - name: Run Tests
         run: |
diff --git a/examples/features/spark_example.py b/examples/features/spark_example.py
@@ -14,7 +14,10 @@
 
 if __name__ == "__main__":
     spark_session = (
-        SparkSession.builder.appName("SparkProfiling").master("local[*]").getOrCreate()
+        SparkSession.builder.appName("SparkProfiling")
+        .master("local[*]")
+        .config("spark.sql.ansi.enabled", "false")
+        .getOrCreate()
     )
 
     print(spark_session.sparkContext.uiWebUrl)  # noqa: T201
diff --git a/examples/meteorites/meteorites.py b/examples/meteorites/meteorites.py
@@ -9,7 +9,7 @@
 if __name__ == "__main__":
     file_name = cache_file(
         "meteorites.csv",
-        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
+        "https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv",
     )
 
     # Set a seed for reproducibility
diff --git a/examples/meteorites/meteorites_cloud.ipynb b/examples/meteorites/meteorites_cloud.ipynb
@@ -60,7 +60,7 @@
    "source": [
     "file_name = cache_file(\n",
     "    \"meteorites.csv\",\n",
-    "    \"https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD\",\n",
+    "    \"https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv\",\n",
     ")\n",
     "\n",
     "df = pd.read_csv(file_name)\n",
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@ package_name = "ydata-profiling"
 
 [project]
 name = "ydata-profiling"
-requires-python = ">=3.7,<3.13"
+requires-python = ">=3.7,<3.14"
 authors = [
     {name = "YData Labs Inc", email = "opensource@ydata.ai"}
 ]
@@ -51,7 +51,8 @@ dependencies = [
     "numpy>=1.16.0,<2.2",
     # Could be optional
     # Related to HTML report
-    "htmlmin==0.1.12",
+    "minify-html>=0.15.0",
+    "filetype>=1.0.0",
     # Correlations
     "phik>=0.11.1,<0.13",
     # Examples
@@ -108,7 +109,7 @@ notebook = [
 # note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
 # set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
 spark = [
-    "pyspark>=3.0",
+    "pyspark>=4.0",
     "pyarrow>=4.0.0",
     "pandas>1.1",
     "numpy>=1.16.0",
diff --git a/src/ydata_profiling/model/pandas/describe_image_pandas.py b/src/ydata_profiling/model/pandas/describe_image_pandas.py
@@ -1,8 +1,8 @@
-import imghdr
 from functools import partial
 from pathlib import Path
 from typing import Optional, Tuple, Union
 
+import filetype
 import imagehash
 import pandas as pd
 from PIL import ExifTags, Image
@@ -12,7 +12,6 @@
     describe_image_1d,
     named_aggregate_summary,
 )
-from ydata_profiling.utils.imghdr_patch import *  # noqa: F401,F403
 
 
 def open_image(path: Path) -> Optional[Image.Image]:
@@ -119,7 +118,8 @@ def extract_exif(image: Image) -> dict:
 
 
 def path_is_image(p: Path) -> bool:
-    return imghdr.what(p) is not None
+    guess = filetype.guess(str(p))
+    return guess is not None and guess.mime.startswith("image/")
 
 
 def count_duplicate_hashes(image_descriptions: dict) -> int:
diff --git a/src/ydata_profiling/model/typeset.py b/src/ydata_profiling/model/typeset.py
@@ -1,11 +1,11 @@
 import datetime
-import imghdr
 import os
 import warnings
 from functools import partial, wraps
 from typing import Callable, Sequence, Set
 from urllib.parse import urlparse
 
+import filetype
 import pandas as pd
 import visions
 from multimethod import multimethod
@@ -295,7 +295,11 @@ def get_relations() -> Sequence[TypeRelation]:
         @multimethod
         @series_handle_nulls
         def contains_op(series: pd.Series, state: dict) -> bool:
-            return all(imghdr.what(p) for p in series)
+            # Guess each path's type once instead of twice per element.
+            kinds = (filetype.guess(str(p)) for p in series)
+            return all(
+                kind is not None and kind.mime.startswith("image/") for kind in kinds
+            )
 
     class TimeSeries(visions.VisionsBaseType):
         @staticmethod
diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py
@@ -451,9 +451,9 @@ def _render_html(self) -> str:
             )
 
             if self.config.html.minify_html:
-                from htmlmin.main import minify
+                import minify_html
 
-                html = minify(html, remove_all_empty_space=True, remove_comments=True)
+                html = minify_html.minify(html, keep_comments=False)
             pbar.update()
         return html
 
diff --git a/src/ydata_profiling/utils/common.py b/src/ydata_profiling/utils/common.py
@@ -8,8 +8,7 @@
 import zipfile
 from datetime import datetime, timedelta
 
-# Monkeypatch bug in imagehdr
-from imghdr import tests
+# Image type detection
 from pathlib import Path
 from typing import Mapping
 
@@ -64,35 +63,6 @@ def extract_zip(outfile, effective_path):
         raise ValueError("Bad zip file") from e
 
 
-def test_jpeg1(h, f):
-    """JPEG data in JFIF format"""
-    if b"JFIF" in h[:23]:
-        return "jpeg"
-
-
-JPEG_MARK = (
-    b"\xff\xd8\xff\xdb\x00C\x00\x08\x06\x06"
-    b"\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f"
-)
-
-
-def test_jpeg2(h, f):
-    """JPEG with small header"""
-    if len(h) >= 32 and h[5] == 67 and h[:32] == JPEG_MARK:
-        return "jpeg"
-
-
-def test_jpeg3(h, f):
-    """JPEG data in JFIF or Exif format"""
-    if h[6:10] in (b"JFIF", b"Exif") or h[:2] == b"\xff\xd8":
-        return "jpeg"
-
-
-tests.append(test_jpeg1)
-tests.append(test_jpeg2)
-tests.append(test_jpeg3)
-
-
 def convert_timestamp_to_datetime(timestamp: int) -> datetime:
     if timestamp >= 0:
         return datetime.fromtimestamp(timestamp)
diff --git a/src/ydata_profiling/utils/imghdr_patch.py b/src/ydata_profiling/utils/imghdr_patch.py
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -83,7 +83,12 @@ def spark_context():
     if not has_spark:
         pytest.skip("Skipping Spark tests because PySpark is not installed.")
 
-    conf = SparkConf().setAppName("pytest-pyspark-tests").setMaster("local[*]")
+    conf = (
+        SparkConf()
+        .setAppName("pytest-pyspark-tests")
+        .setMaster("local[*]")
+        .set("spark.sql.ansi.enabled", "false")
+    )
 
     # Check if SparkContext exists before creating a new one
     if SparkContext._active_spark_context:
@@ -105,7 +110,12 @@ def spark_session(spark_context):
     """
     if not has_spark:
         pytest.skip("Skipping Spark tests because PySpark is not installed.")
-    spark = SparkSession.builder.config(conf=spark_context.getConf()).getOrCreate()
+    spark = (
+        SparkSession.builder.config(conf=spark_context.getConf())
+        # Explicitly keep ANSI mode off to restore permissive casts.
+        .config("spark.sql.ansi.enabled", "false")
+        .getOrCreate()
+    )
 
     yield spark
 
diff --git a/tests/issues/test_issue147.py b/tests/issues/test_issue147.py
@@ -19,4 +19,4 @@ def test_issue147(get_data_file):
     )
     html = report.to_html()
     assert type(html) == str
-    assert "Dataset statistics</p>" in html
+    assert "Dataset statistics" in html
diff --git a/tests/issues/test_issue169.py b/tests/issues/test_issue169.py
@@ -32,7 +32,7 @@ def test_issue_169_column(issue_169_data):
     )
     html = report.to_html()
     assert type(html) == str
-    assert "Dataset statistics</p>" in html
+    assert "Dataset statistics" in html
 
 
 def test_issue_169_index(issue_169_data):
@@ -45,4 +45,4 @@ def test_issue_169_index(issue_169_data):
     )
     html = report.to_html()
     assert type(html) == str
-    assert "Dataset statistics</p>" in html
+    assert "Dataset statistics" in html
diff --git a/tests/notebooks/meteorites.ipynb b/tests/notebooks/meteorites.ipynb
@@ -33,7 +33,7 @@
    "source": [
     "file_name = cache_file(\n",
     "    \"meteorites.csv\",\n",
-    "    \"https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD\",\n",
+    "    \"https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv\",\n",
     ")\n",
     "\n",
     "df = pd.read_csv(file_name)\n",
diff --git a/tests/unit/test_console.py b/tests/unit/test_console.py
@@ -10,7 +10,7 @@
 def console_data(get_data_file):
     return get_data_file(
         "meteorites.csv",
-        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
+        "https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv",
     )
 
 
diff --git a/tests/unit/test_dataset_schema.py b/tests/unit/test_dataset_schema.py
@@ -28,7 +28,7 @@ def test_dataset_schema():
     assert ">Dataset<" in html
     for key in metadata.keys():
         if not key.startswith("copyright_") and key != "url":
-            assert f"<th>{key.capitalize()}</th>" in html
+            assert f"<th>{key.capitalize()}<td" in html
     assert "<tr><th>Copyright</th><td>(c) RandoCorp LLC 2020</td></tr>"
     assert '<tr><th>URL</th><td><a href="http://www.dataset-sources.com/data/dataset.dat">http://www.dataset-sources.com/data/dataset.dat</a></td></tr>'
     assert ">Reproduction<" in html
diff --git a/tests/unit/test_example.py b/tests/unit/test_example.py
@@ -8,7 +8,7 @@
 def test_example(get_data_file, test_output_dir):
     file_name = get_data_file(
         "meteorites.csv",
-        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
+        "https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv",
     )
 
     # For reproducibility
diff --git a/tests/unit/test_modular.py b/tests/unit/test_modular.py
@@ -11,7 +11,7 @@
 def tdf(get_data_file):
     file_name = get_data_file(
         "meteorites.csv",
-        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
+        "https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv",
     )
 
     df = pd.read_csv(file_name)
diff --git a/tests/unit/test_serialize.py b/tests/unit/test_serialize.py
@@ -9,7 +9,7 @@
 def test_load(get_data_file, test_output_dir):
     file_name = get_data_file(
         "meteorites.csv",
-        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
+        "https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv",
     )
 
     # For reproducibility
diff --git a/tests/unit/test_time_series.py b/tests/unit/test_time_series.py
@@ -49,10 +49,10 @@ def sample_ts_df():
 
 
 def test_timeseries_identification(html_profile: str):
-    assert "<th>TimeSeries</th>" in html_profile, "TimeSeries not detected"
+
+    assert "<th>TimeSeries<td" in html_profile, "TimeSeries not detected"
     assert (
-        '<tr><th>TimeSeries</th><td style="white-space: nowrap;">8</td></tr>'
-        in html_profile
+        'TimeSeries<td style="white-space: nowrap;">8' in html_profile
     ), "TimeSeries incorrectly identified"
 
 
diff --git a/tests/unit/test_url.py b/tests/unit/test_url.py
@@ -25,4 +25,4 @@ def test_urls(get_data_file):
     )
 
     assert "URL</span>" in profile.to_html(), "URL not detected"
-    assert "URL</th>" in profile.to_html(), "URL not detected"
+    assert "<th>URL<td" in profile.to_html(), "URL not detected"

Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@`
`9`	`9`	`if __name__ == "__main__":`
`10`	`10`	`file_name = cache_file(`
`11`	`11`	`"meteorites.csv",`
`12`		`- "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",`
	`12`	`+ "https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv",`
`13`	`13`	`)`
`14`	`14`
`15`	`15`	`# Set a seed for reproducibility`
Original file line number	Diff line number	Diff line change
`@@ -451,9 +451,9 @@ def _render_html(self) -> str:`
`451`	`451`	`)`
`452`	`452`
`453`	`453`	`if self.config.html.minify_html:`
`454`		`- from htmlmin.main import minify`
	`454`	`+ import minify_html`
`455`	`455`
`456`		`- html = minify(html, remove_all_empty_space=True, remove_comments=True)`
	`456`	`+ html = minify_html.minify(html, keep_comments=False)`
`457`	`457`	`pbar.update()`
`458`	`458`	`return html`
`459`	`459`