115 changes: 115 additions & 0 deletions dev/docker/perf/Dockfile.centos9-run
@@ -0,0 +1,115 @@
FROM quay.io/centos/centos:stream9
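# Build-time pins: the Spark 3.5.2 binary distribution and its detached signature,
# the fingerprint of the release signing key used to verify it, and the Iceberg
# Spark runtime jar. Each can be overridden with --build-arg.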

ARG ARG_SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
ARG ARG_SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz.asc
ARG ARG_GPG_KEY=D76E23B9F11B5BF6864613C4F7051850A0AF904D
ARG ARG_ICEBERG_URL=https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.5.0/iceberg-spark-runtime-3.5_2.12-1.5.0.jar

COPY requirements.txt /opt/


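# Single layer: install OS and Python dependencies, fetch and GPG-verify the
# Spark distribution, and lay it out under /opt/spark.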
RUN set -ex; \
useradd spark; \
dnf install -y bash vim libssh gnupg2 hostname java-17-openjdk less net-tools procps python3 sudo util-linux wget which pip; \
dnf install -y epel-release; \
dnf install -y jemalloc; \
ln -s $(which python3) /usr/bin/python; \
pip install -r /opt/requirements.txt; \
python3 -m pip install --upgrade jsonschema; \
rm -rf /opt/requirements.txt; \
dnf clean all; \
mkdir -p /opt/spark/work-dir; \
mkdir -p /opt/spark/work-dir/ipython/output; \
mkdir -p /opt/spark/work-dir/ipython/result; \
mkdir -p /opt/spark/work-dir/ipython/analysis; \
chmod g+w /opt/spark/work-dir; \
mkdir -p /opt/spark/events; \
chmod g+w /opt/spark/events; \
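# Fetch the Spark tarball and verify its detached signature against the release key before unpacking.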
export SPARK_TMP="$(mktemp -d)"; \
cd $SPARK_TMP; \
wget -nv -O spark.tgz "$ARG_SPARK_TGZ_URL"; \
wget -nv -O spark.tgz.asc "$ARG_SPARK_TGZ_ASC_URL"; \
export GNUPGHOME="$(mktemp -d)"; \
gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys "$ARG_GPG_KEY" || gpg --batch --keyserver hkps://keyserver.ubuntu.com --recv-keys "$ARG_GPG_KEY"; \
gpg --batch --verify spark.tgz.asc spark.tgz; \
gpgconf --kill all; \
rm -rf "$GNUPGHOME" spark.tgz.asc; \
tar -xf spark.tgz --strip-components=1; \
chown -R spark:spark .; \
mv RELEASE /opt/spark/; \
mv bin /opt/spark/; \
mv conf /opt/spark/; \
mv data /opt/spark/; \
mv examples /opt/spark/; \
mv jars /opt/spark/; \
mv python /opt/spark/; \
mv sbin /opt/spark/; \
cd ..; \
rm -rf "$SPARK_TMP"; \
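# Add the Hadoop S3A connector, the AWS SDK bundle, and the Iceberg Spark runtime to Spark's classpath.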
cd /opt/spark/jars; \
wget -nv https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar; \
wget -nv https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar; \
wget -nv $ARG_ICEBERG_URL; \
cd /opt/spark; \
chown -R spark:spark .; \
echo 'spark ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers; \
echo 'alias ll="ls -alh"' >> /etc/bashrc


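# Runtime assets: container entrypoint, benchmark notebooks, Telegraf config, and Spark defaults.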
COPY entrypoint.sh /opt/
COPY ipython /opt/spark/work-dir/ipython
COPY telegraf.conf /etc/telegraf/telegraf.conf
COPY spark-defaults.conf /opt/spark/conf/spark-defaults.conf

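# Register /usr/lib64/gluten-libs with the dynamic linker so Gluten's native libraries resolve at runtime.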
RUN echo "/usr/lib64/gluten-libs" > /etc/ld.so.conf.d/gluten.conf && ldconfig

ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH

WORKDIR /opt/spark/work-dir

USER spark
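# Configure Jupyter for the spark user: custom CSS, a preset login password
# (argon2 hash), and the contrib nbextensions enabled below.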
RUN jupyter notebook --generate-config; \
mkdir -p ~/.jupyter/custom/; \
echo '.container { width:100% !important; }' >> ~/.jupyter/custom/custom.css; \
echo 'div.output_stderr { background: #ffdd; display: none; }' >> ~/.jupyter/custom/custom.css; \
echo '.CodeMirror{font-family: "Courier New";font-size: 12pt;}' >> ~/.jupyter/custom/custom.css; \
echo '.rendered_html table, .rendered_html td, .rendered_html th {font-size: 20px;}' >> ~/.jupyter/custom/custom.css; \
echo '{ "NotebookApp": { "password": "argon2:$argon2id$v=19$m=10240,t=10,p=8$JL+YUwQ5lsg9rFXeCpBfgQ$Xv2GOkdfxoe41FaedCHVAOvxisbd9i3Xjn+KkDtBD44" } } ' > ~/.jupyter/jupyter_notebook_config.json; \
jupyter nbextension install --py jupyter_highlight_selected_word --user; \
jupyter nbextension enable highlight_selected_word/main; \
jupyter nbextension install --py widgetsnbextension --user; \
jupyter contrib nbextension install --user; \
jupyter nbextension enable codefolding/main; \
jupyter nbextension enable code_prettify/code_prettify; \
jupyter nbextension enable codefolding/edit; \
jupyter nbextension enable code_font_size/code_font_size; \
jupyter nbextension enable collapsible_headings/main; \
jupyter nbextension enable ipyparallel/main; \
jupyter nbextension enable move_selected_cells/main; \
jupyter nbextension enable nbTranslate/main; \
jupyter nbextension enable scratchpad/main; \
jupyter nbextension enable tree-filter/index; \
jupyter nbextension enable comment-uncomment/main; \
jupyter nbextension enable export_embedded/main; \
jupyter nbextension enable hide_header/main; \
jupyter nbextension enable highlighter/highlighter; \
jupyter nbextension enable scroll_down/main; \
jupyter nbextension enable snippets/main; \
jupyter nbextension enable toc2/main; \
jupyter nbextension enable varInspector/main; \
jupyter nbextension enable contrib_nbextensions_help_item/main; \
jupyter nbextension enable freeze/main; \
jupyter nbextension enable hide_input/main; \
jupyter nbextension enable jupyter-js-widgets/extension; \
jupyter nbextension enable snippets_menu/main; \
jupyter nbextension enable table_beautifier/main; \
jupyter nbextension enable hide_input_all/main; \
jupyter nbextension enable spellchecker/main; \
jupyter nbextension enable toggle_all_line_numbers/main; \
jupyter nbextensions_configurator enable --user;


ENTRYPOINT ["/opt/entrypoint.sh"]
32 changes: 32 additions & 0 deletions dev/docker/perf/build.sh
@@ -0,0 +1,32 @@
#!/bin/bash

pushd ~
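# Host-side directories for Spark jars, shared native libraries, and the TPC-H SF100 dataset.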
mkdir jars
mkdir shared-libs
mkdir tpch_sf100_parquet_zstd

disk_names=( $(sudo lsblk -d -o NAME | tail -n +2) )
data_id=1
for disk in "${disk_names[@]}"; do
  echo "Checking disk: $disk"
  # If the disk is an Amazon EC2 NVMe Instance Storage volume, format it with ext4 and mount it
  if sudo fdisk -l "/dev/$disk" | grep -q "Amazon EC2 NVMe Instance Storage"; then
    echo "Disk $disk is an Amazon EC2 NVMe Instance Storage volume"
    sudo mkfs.ext4 "/dev/$disk"
    sudo mkdir -p /mnt/data${data_id}
    sudo mount -t ext4 "/dev/$disk" /mnt/data${data_id}
    echo "/dev/$disk /mnt/data${data_id} auto noatime 0 0" | sudo tee -a /etc/fstab
    sudo lsblk -f
    data_id=$((data_id + 1))
  else
    echo "Disk $disk is not an Amazon EC2 NVMe Instance Storage volume"
  fi
done
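# Download and unpack the Telegraf metrics agent used for host metrics collection.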
wget https://dl.influxdata.com/telegraf/releases/telegraf-1.34.1_linux_amd64.tar.gz
tar zxvf ./telegraf-1.34.1_linux_amd64.tar.gz > /dev/null
popd

docker buildx build --load --platform "linux/amd64" -t gluten-images -f ./Dockfile.centos9-run .

69 changes: 69 additions & 0 deletions dev/docker/perf/entrypoint.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prevent any errors from being silently ignored
set -eo pipefail

if [ -z "$JAVA_HOME" ]; then
  JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
fi

SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
for v in "${!SPARK_JAVA_OPT_@}"; do
  SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" )
done

if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
  SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
fi

if ! [ -z "${PYSPARK_PYTHON+x}" ]; then
  export PYSPARK_PYTHON
fi
if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then
  export PYSPARK_DRIVER_PYTHON
fi

# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not, set the latter here so
# Hadoop jars are available to the executors. SPARK_DIST_CLASSPATH is left alone
# if already set, to avoid overriding customizations made elsewhere, e.g. in Docker/K8s.
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
  export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
fi

if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then
  SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
fi

if ! [ -z "${SPARK_CONF_DIR+x}" ]; then
  SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
elif ! [ -z "${SPARK_HOME+x}" ]; then
  SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
fi

SPARK_MASTER_URL="spark://$(hostname):7077"

# Change owner of files added on top of Spark installation
sudo chown -R spark:spark "${SPARK_HOME}"


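# Run a single-node standalone cluster: the master and one worker share this container.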
${SPARK_HOME}/sbin/start-master.sh
${SPARK_HOME}/sbin/start-worker.sh "$SPARK_MASTER_URL"

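# Start Jupyter and Telegraf in the background; sleep keeps the container alive.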
jupyter notebook --ip=0.0.0.0 --port=8989 --no-browser --notebook-dir=/opt/spark/work-dir/ipython &
/usr/bin/telegraf &

sleep infinity