115 changes: 115 additions & 0 deletions dev/docker/perf/Dockfile.centos9-run
@@ -0,0 +1,115 @@
FROM quay.io/centos/centos:stream9
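# Build-time pins: the Spark 3.5.2 binary distribution and its detached signature,
# the fingerprint of the release signing key used to verify it, and the Iceberg
# Spark runtime jar. Each can be overridden with --build-arg.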

ARG ARG_SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
ARG ARG_SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz.asc
ARG ARG_GPG_KEY=D76E23B9F11B5BF6864613C4F7051850A0AF904D
ARG ARG_ICEBERG_URL=https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.5.0/iceberg-spark-runtime-3.5_2.12-1.5.0.jar

COPY requirements.txt /opt/


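# Single layer: install OS and Python dependencies, fetch and GPG-verify the
# Spark distribution, and lay it out under /opt/spark.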
RUN set -ex; \
useradd spark; \
dnf install -y bash vim libssh gnupg2 hostname java-17-openjdk less net-tools procps python3 sudo util-linux wget which pip; \
dnf install -y epel-release; \
dnf install -y jemalloc; \
ln -s $(which python3) /usr/bin/python; \
pip install -r /opt/requirements.txt; \
python3 -m pip install --upgrade jsonschema; \
rm -rf /opt/requirements.txt; \
dnf clean all; \
mkdir -p /opt/spark/work-dir; \
mkdir -p /opt/spark/work-dir/ipython/output; \
mkdir -p /opt/spark/work-dir/ipython/result; \
mkdir -p /opt/spark/work-dir/ipython/analysis; \
chmod g+w /opt/spark/work-dir; \
mkdir -p /opt/spark/events; \
chmod g+w /opt/spark/events; \
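# Fetch the Spark tarball and verify its detached signature against the release key before unpacking.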
export SPARK_TMP="$(mktemp -d)"; \
cd $SPARK_TMP; \
wget -nv -O spark.tgz "$ARG_SPARK_TGZ_URL"; \
wget -nv -O spark.tgz.asc "$ARG_SPARK_TGZ_ASC_URL"; \
export GNUPGHOME="$(mktemp -d)"; \
gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys "$ARG_GPG_KEY" || gpg --batch --keyserver hkps://keyserver.ubuntu.com --recv-keys "$ARG_GPG_KEY"; \
gpg --batch --verify spark.tgz.asc spark.tgz; \
gpgconf --kill all; \
rm -rf "$GNUPGHOME" spark.tgz.asc; \
tar -xf spark.tgz --strip-components=1; \
chown -R spark:spark .; \
mv RELEASE /opt/spark/; \
mv bin /opt/spark/; \
mv conf /opt/spark/; \
mv data /opt/spark/; \
mv examples /opt/spark/; \
mv jars /opt/spark/; \
mv python /opt/spark/; \
mv sbin /opt/spark/; \
cd ..; \
rm -rf "$SPARK_TMP"; \
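# Add the Hadoop S3A connector, the AWS SDK bundle, and the Iceberg Spark runtime to Spark's classpath.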
cd /opt/spark/jars; \
wget -nv https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar; \
wget -nv https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar; \
wget -nv $ARG_ICEBERG_URL; \
cd /opt/spark; \
chown -R spark:spark .; \
echo 'spark ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers; \
echo 'alias ll="ls -alh"' >> /etc/bashrc


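# Runtime assets: container entrypoint, benchmark notebooks, Telegraf config, and Spark defaults.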
COPY entrypoint.sh /opt/
COPY ipython /opt/spark/work-dir/ipython
COPY telegraf.conf /etc/telegraf/telegraf.conf
COPY spark-defaults.conf /opt/spark/conf/spark-defaults.conf

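# Register /usr/lib64/gluten-libs with the dynamic linker so Gluten's native libraries resolve at runtime.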
RUN echo "/usr/lib64/gluten-libs" > /etc/ld.so.conf.d/gluten.conf && ldconfig

ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH

WORKDIR /opt/spark/work-dir

USER spark
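# Configure Jupyter for the spark user: custom CSS, a preset login password
# (argon2 hash), and the contrib nbextensions enabled below.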
RUN jupyter notebook --generate-config; \
mkdir -p ~/.jupyter/custom/; \
echo '.container { width:100% !important; }' >> ~/.jupyter/custom/custom.css; \
echo 'div.output_stderr { background: #ffdd; display: none; }' >> ~/.jupyter/custom/custom.css; \
echo '.CodeMirror{font-family: "Courier New";font-size: 12pt;}' >> ~/.jupyter/custom/custom.css; \
echo '.rendered_html table, .rendered_html td, .rendered_html th {font-size: 20px;}' >> ~/.jupyter/custom/custom.css; \
echo '{ "NotebookApp": { "password": "argon2:$argon2id$v=19$m=10240,t=10,p=8$JL+YUwQ5lsg9rFXeCpBfgQ$Xv2GOkdfxoe41FaedCHVAOvxisbd9i3Xjn+KkDtBD44" } } ' > ~/.jupyter/jupyter_notebook_config.json; \
jupyter nbextension install --py jupyter_highlight_selected_word --user; \
jupyter nbextension enable highlight_selected_word/main; \
jupyter nbextension install --py widgetsnbextension --user; \
jupyter contrib nbextension install --user; \
jupyter nbextension enable codefolding/main; \
jupyter nbextension enable code_prettify/code_prettify; \
jupyter nbextension enable codefolding/edit; \
jupyter nbextension enable code_font_size/code_font_size; \
jupyter nbextension enable collapsible_headings/main; \
jupyter nbextension enable ipyparallel/main; \
jupyter nbextension enable move_selected_cells/main; \
jupyter nbextension enable nbTranslate/main; \
jupyter nbextension enable scratchpad/main; \
jupyter nbextension enable tree-filter/index; \
jupyter nbextension enable comment-uncomment/main; \
jupyter nbextension enable export_embedded/main; \
jupyter nbextension enable hide_header/main; \
jupyter nbextension enable highlighter/highlighter; \
jupyter nbextension enable scroll_down/main; \
jupyter nbextension enable snippets/main; \
jupyter nbextension enable toc2/main; \
jupyter nbextension enable varInspector/main; \
jupyter nbextension enable contrib_nbextensions_help_item/main; \
jupyter nbextension enable freeze/main; \
jupyter nbextension enable hide_input/main; \
jupyter nbextension enable jupyter-js-widgets/extension; \
jupyter nbextension enable snippets_menu/main; \
jupyter nbextension enable table_beautifier/main; \
jupyter nbextension enable hide_input_all/main; \
jupyter nbextension enable spellchecker/main; \
jupyter nbextension enable toggle_all_line_numbers/main; \
jupyter nbextensions_configurator enable --user;


ENTRYPOINT ["/opt/entrypoint.sh"]
32 changes: 32 additions & 0 deletions dev/docker/perf/build.sh
@@ -0,0 +1,32 @@
#!/bin/bash

pushd ~
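# Host-side directories for Spark jars, shared native libraries, and the TPC-H SF100 dataset.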
mkdir jars
mkdir shared-libs
mkdir tpch_sf100_parquet_zstd

disk_names=( $(sudo lsblk -d -o NAME | tail -n +2) )
data_id=1
for disk in "${disk_names[@]}"; do
  echo "Checking disk: $disk"
  # If the disk is an Amazon EC2 NVMe Instance Storage volume, format it with ext4 and mount it
  if sudo fdisk -l "/dev/$disk" | grep -q "Amazon EC2 NVMe Instance Storage"; then
    echo "Disk $disk is an Amazon EC2 NVMe Instance Storage volume"
    sudo mkfs.ext4 "/dev/$disk"
    sudo mkdir -p /mnt/data${data_id}
    sudo mount -t ext4 "/dev/$disk" /mnt/data${data_id}
    echo "/dev/$disk /mnt/data${data_id} auto noatime 0 0" | sudo tee -a /etc/fstab
    sudo lsblk -f
    data_id=$((data_id + 1))
  else
    echo "Disk $disk is not an Amazon EC2 NVMe Instance Storage volume"
  fi
done
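# Download and unpack the Telegraf metrics agent used for host metrics collection.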
wget https://dl.influxdata.com/telegraf/releases/telegraf-1.34.1_linux_amd64.tar.gz
tar zxvf ./telegraf-1.34.1_linux_amd64.tar.gz > /dev/null
popd

docker buildx build --load --platform "linux/amd64" -t gluten-images -f ./Dockfile.centos9-run .

69 changes: 69 additions & 0 deletions dev/docker/perf/entrypoint.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prevent any errors from being silently ignored
set -eo pipefail

if [ -z "$JAVA_HOME" ]; then
  JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
fi

SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
for v in "${!SPARK_JAVA_OPT_@}"; do
  SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" )
done

if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
  SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
fi

if ! [ -z "${PYSPARK_PYTHON+x}" ]; then
  export PYSPARK_PYTHON
fi
if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then
  export PYSPARK_DRIVER_PYTHON
fi

# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not, set the latter here so
# Hadoop jars are available to the executors. SPARK_DIST_CLASSPATH is left alone
# if already set, to avoid overriding customizations made elsewhere, e.g. in Docker/K8s.
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
  export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
fi

if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then
  SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
fi

if ! [ -z "${SPARK_CONF_DIR+x}" ]; then
  SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
elif ! [ -z "${SPARK_HOME+x}" ]; then
  SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
fi

SPARK_MASTER_URL="spark://$(hostname):7077"

# Change owner of files added on top of Spark installation
sudo chown -R spark:spark "${SPARK_HOME}"


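# Run a single-node standalone cluster: the master and one worker share this container.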
${SPARK_HOME}/sbin/start-master.sh
${SPARK_HOME}/sbin/start-worker.sh "$SPARK_MASTER_URL"

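# Start Jupyter and Telegraf in the background; sleep keeps the container alive.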
jupyter notebook --ip=0.0.0.0 --port=8989 --no-browser --notebook-dir=/opt/spark/work-dir/ipython &
/usr/bin/telegraf &

sleep infinity