feat(spark): add spark dockerbit #551

Draft · wants to merge 1 commit into master
Makefile: 5 changes (4 additions, 1 deletion)
@@ -16,6 +16,8 @@ DOCKER-STACKS-UPSTREAM-TAG := ed2908bbb62e
tensorflow-CUDA := 11.8.0
pytorch-CUDA := 11.8.0

SPARK := main

# https://stackoverflow.com/questions/5917413/concatenate-multiple-files-but-include-filename-as-section-headers
CAT := awk '(FNR==1){print "\n\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n\#\#\# " FILENAME "\n\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"}1'
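The CAT helper is the concatenate-with-filename-headers trick from the StackOverflow answer linked above; a simplified sketch of the same idea in plain bash (shorter banner, hypothetical file names) looks like this:

# Simplified version of the CAT recipe: FNR==1 is true on the first line of each
# input file, so a banner naming the file is printed there; the trailing "1" then
# prints every input line unchanged.
awk '(FNR==1){print "\n### " FILENAME " ###"}1' a.Dockerfile b.Dockerfile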

@@ -64,7 +66,7 @@ generate-CUDA:
bash scripts/get-nvidia-stuff.sh $(pytorch-CUDA) > $(SRC)/1_CUDA-$(pytorch-CUDA).Dockerfile

generate-Spark:
-bash scripts/get-spark-stuff.sh --commit $(COMMIT) > $(SRC)/2_Spark.Dockerfile
+bash scripts/get-spark-stuff.sh --commit $(SPARK) > $(SRC)/2_Spark.Dockerfile
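scripts/get-spark-stuff.sh itself is not part of this diff, so the following is only a hypothetical sketch of what it might do, assuming it follows the same pattern as get-nvidia-stuff.sh above: fetch the upstream docker-stacks Spark Dockerfiles at the requested ref and write them to stdout so the Makefile can redirect them into 2_Spark.Dockerfile.

#!/bin/bash
# Hypothetical sketch only -- not the actual scripts/get-spark-stuff.sh in this repo.
# Usage: get-spark-stuff.sh --commit <docker-stacks ref>, e.g. --commit main
set -euo pipefail

ref="main"
if [[ "${1:-}" == "--commit" && -n "${2:-}" ]]; then
    ref="$2"
fi

base="https://raw.githubusercontent.com/jupyter/docker-stacks/${ref}/images"

echo "# Spark stuff"
for image in pyspark-notebook all-spark-notebook; do
    echo ""
    echo "###########################"
    echo "### ${image}"
    echo "###########################"
    echo "# ${base}/${image}/Dockerfile"
    echo ""
    # Drop the FROM line so the bit can be spliced into an already-built base image.
    curl -fsSL "${base}/${image}/Dockerfile" | grep -v '^FROM '
done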

###################################
###### Dockerfile Management ######
@@ -148,6 +150,7 @@ jupyterlab: pytorch tensorflow cpu
cp -r resources/common/. $(OUT)/$@-$${type}/; \
$(CAT) \
$(TMP)/$${type}.Dockerfile \
$(SRC)/2_Spark.Dockerfile \
$(SRC)/3_Kubeflow.Dockerfile \
$(SRC)/4_CLI.Dockerfile \
$(SRC)/5_DB-Drivers.Dockerfile \
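With these Makefile changes, regenerating the Spark bit and re-assembling the output Dockerfiles would look roughly like this (SPARK defaults to main and can be overridden on the make command line):

# Regenerate docker-bits/2_Spark.Dockerfile from the pinned docker-stacks ref
make generate-Spark SPARK=main
# Rebuild the jupyterlab output Dockerfiles, which now splice in the Spark bit
make jupyterlab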
docker-bits/2_Spark.Dockerfile: 132 changes (132 additions, 0 deletions)
@@ -0,0 +1,132 @@
# Spark stuff

###########################
### pyspark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/pyspark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lower case to distinguish them from ENV)
ARG spark_version="3.5.0"
ARG hadoop_version="3"
ARG scala_version
ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
ARG openjdk_version="17"
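As the comment above notes, these defaults can be overridden at build time. With a plain docker build against the generated output (paths and tags here are illustrative), that would look something like:

# spark_checksum must be the sha512 of the matching spark-<version>-bin-hadoop<hadoop>.tgz
docker build \
    --build-arg spark_version=3.5.0 \
    --build-arg spark_checksum=8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319 \
    -t jupyterlab-cpu:spark-3.5.0 \
    output/jupyterlab-cpu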

ENV APACHE_SPARK_VERSION="${spark_version}" \
HADOOP_VERSION="${hadoop_version}"

RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
"openjdk-${openjdk_version}-jre-headless" \
ca-certificates-java && \
apt-get clean && rm -rf /var/lib/apt/lists/*

# Spark installation
WORKDIR /tmp

# You need to use the https://archive.apache.org/dist/ site if you want to download old Spark versions,
# but it seems to be slower, which is why we use the recommended download site here
RUN if [ -z "${scala_version}" ]; then \
curl --progress-bar --location --output "spark.tgz" \
"https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
else \
curl --progress-bar --location --output "spark.tgz" \
"https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
fi && \
echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
rm "spark.tgz"
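Because the archive is verified with sha512sum -c, bumping spark_version also means updating spark_checksum. One way to obtain the new digest (shown for the default artifact without a Scala suffix) is roughly:

SPARK_VERSION=3.5.0
HADOOP_VERSION=3
# Download the release tarball and print its sha512 digest
curl -fsSL "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
    | sha512sum | awk '{print $1}'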

# Configure Spark
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
PATH="${PATH}:${SPARK_HOME}/bin"

RUN if [ -z "${scala_version}" ]; then \
ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
else \
ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
fi && \
# Add a link in the before_notebook hook so that PYTHONPATH is sourced automatically && \
ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"

USER ${NB_UID}

# Install pyarrow
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on.
# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
RUN mamba install --yes \
'grpcio-status' \
'grpcio' \
'pandas=2.0.3' \
'pyarrow' && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"
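Per the note above the mamba install, the pandas pin can be cross-checked against the Spark source tree. A rough way to do that (the grep pattern is a guess at how the pin appears in spark/dev/infra/Dockerfile):

# Fetch the Spark sources at the matching release tag and look up the pandas pin
git clone --depth 1 --branch v3.5.0 https://github.com/apache/spark.git /tmp/spark
grep -in 'pandas' /tmp/spark/dev/infra/Dockerfile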

WORKDIR "${HOME}"
EXPOSE 4040
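Port 4040 is the Spark application UI. As a minimal smoke test of an image built from this bit (the tag below is hypothetical; substitute whatever the build actually produces):

# spark-submit is on PATH via SPARK_HOME/bin, so this should print the Spark/Scala/Java versions
docker run --rm jupyterlab-cpu:latest spark-submit --version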

###########################
### all-spark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/all-spark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# RSpark config
ENV R_LIBS_USER "${SPARK_HOME}/R/lib"
RUN fix-permissions "${R_LIBS_USER}"

# R pre-requisites
RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
fonts-dejavu \
gfortran \
gcc && \
apt-get clean && rm -rf /var/lib/apt/lists/*

USER ${NB_UID}

# R packages, including IRKernel, which gets installed globally.
RUN mamba install --yes \
'r-base' \
'r-ggplot2' \
'r-irkernel' \
'r-rcurl' \
'r-sparklyr' && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"
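The all-spark-notebook half adds the R toolchain and sparklyr on top. A similarly minimal check that the R side is wired up (again with a hypothetical image tag) could be:

# Loads sparklyr from the conda-installed R library and prints its version
docker run --rm jupyterlab-cpu:latest R -e 'library(sparklyr); packageVersion("sparklyr")'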
output/jupyterlab-cpu/Dockerfile: 137 changes (137 additions, 0 deletions)
@@ -28,6 +28,143 @@ RUN apt-get update --yes
&& rm -rf /var/lib/apt/lists/* \
&& chmod +x /usr/bin/clean-layer.sh

###############################
### docker-bits/2_Spark.Dockerfile
###############################

# Spark stuff

###########################
### pyspark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/pyspark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lower case to distinguish them from ENV)
ARG spark_version="3.5.0"
ARG hadoop_version="3"
ARG scala_version
ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
ARG openjdk_version="17"

ENV APACHE_SPARK_VERSION="${spark_version}" \
HADOOP_VERSION="${hadoop_version}"

RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
"openjdk-${openjdk_version}-jre-headless" \
ca-certificates-java && \
apt-get clean && rm -rf /var/lib/apt/lists/*

# Spark installation
WORKDIR /tmp

# You need to use the https://archive.apache.org/dist/ site if you want to download old Spark versions,
# but it seems to be slower, which is why we use the recommended download site here
RUN if [ -z "${scala_version}" ]; then \
curl --progress-bar --location --output "spark.tgz" \
"https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
else \
curl --progress-bar --location --output "spark.tgz" \
"https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
fi && \
echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
rm "spark.tgz"

# Configure Spark
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
PATH="${PATH}:${SPARK_HOME}/bin"

RUN if [ -z "${scala_version}" ]; then \
ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
else \
ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
fi && \
# Add a link in the before_notebook hook so that PYTHONPATH is sourced automatically && \
ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"

USER ${NB_UID}

# Install pyarrow
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on.
# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
RUN mamba install --yes \
'grpcio-status' \
'grpcio' \
'pandas=2.0.3' \
'pyarrow' && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"

WORKDIR "${HOME}"
EXPOSE 4040

###########################
### all-spark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/all-spark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# RSpark config
ENV R_LIBS_USER "${SPARK_HOME}/R/lib"
RUN fix-permissions "${R_LIBS_USER}"

# R pre-requisites
RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
fonts-dejavu \
gfortran \
gcc && \
apt-get clean && rm -rf /var/lib/apt/lists/*

USER ${NB_UID}

# R packages, including IRKernel, which gets installed globally.
RUN mamba install --yes \
'r-base' \
'r-ggplot2' \
'r-irkernel' \
'r-rcurl' \
'r-sparklyr' && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"

###############################
### docker-bits/3_Kubeflow.Dockerfile
###############################