From 7c69f9c4f982d1b8f8881f7845fea3a9fd115cec Mon Sep 17 00:00:00 2001
From: souheil-yazji
Date: Tue, 14 Nov 2023 14:57:54 +0000
Subject: [PATCH] feat(spark): add spark dockerbit

---
 Makefile                                |   5 +-
 docker-bits/2_Spark.Dockerfile          | 132 ++++++++++++++++++++++++
 output/jupyterlab-cpu/Dockerfile        | 137 ++++++++++++++++++++++++
 output/jupyterlab-pytorch/Dockerfile    | 137 ++++++++++++++++++++++++
 output/jupyterlab-tensorflow/Dockerfile | 137 ++++++++++++++++++++++++
 scripts/get-spark-stuff.sh              |   4 +-
 6 files changed, 549 insertions(+), 3 deletions(-)
 create mode 100644 docker-bits/2_Spark.Dockerfile

diff --git a/Makefile b/Makefile
index bb5fa1b2e..24fdc8d5e 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,8 @@ DOCKER-STACKS-UPSTREAM-TAG := ed2908bbb62e
 tensorflow-CUDA := 11.8.0
 pytorch-CUDA := 11.8.0
 
+SPARK := main
+
 # https://stackoverflow.com/questions/5917413/concatenate-multiple-files-but-include-filename-as-section-headers
 CAT := awk '(FNR==1){print "\n\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n\#\#\# " FILENAME "\n\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"}1'
 
@@ -64,7 +66,7 @@ generate-CUDA:
 	bash scripts/get-nvidia-stuff.sh $(pytorch-CUDA) > $(SRC)/1_CUDA-$(pytorch-CUDA).Dockerfile
 
 generate-Spark:
-	bash scripts/get-spark-stuff.sh --commit $(COMMIT) > $(SRC)/2_Spark.Dockerfile
+	bash scripts/get-spark-stuff.sh --commit $(SPARK) > $(SRC)/2_Spark.Dockerfile
 
 ###################################
 ###### Dockerfile Management ######
@@ -148,6 +150,7 @@ jupyterlab: pytorch tensorflow cpu
 		cp -r resources/common/. $(OUT)/$@-$${type}/; \
 		$(CAT) \
 			$(TMP)/$${type}.Dockerfile \
+			$(SRC)/2_Spark.Dockerfile \
 			$(SRC)/3_Kubeflow.Dockerfile \
 			$(SRC)/4_CLI.Dockerfile \
 			$(SRC)/5_DB-Drivers.Dockerfile \
diff --git a/docker-bits/2_Spark.Dockerfile b/docker-bits/2_Spark.Dockerfile
new file mode 100644
index 000000000..80d172b2f
--- /dev/null
+++ b/docker-bits/2_Spark.Dockerfile
@@ -0,0 +1,132 @@
+# Spark stuff
+
+###########################
+### pyspark-notebook
+###########################
+# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/pyspark-notebook/Dockerfile
+
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# Spark dependencies
+# Default values can be overridden at build time
+# (ARGS are in lower case to distinguish them from ENV)
+ARG spark_version="3.5.0"
+ARG hadoop_version="3"
+ARG scala_version
+ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
+ARG openjdk_version="17"
+
+ENV APACHE_SPARK_VERSION="${spark_version}" \
+    HADOOP_VERSION="${hadoop_version}"
+
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    "openjdk-${openjdk_version}-jre-headless" \
+    ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Spark installation
+WORKDIR /tmp
+
+# Older Spark versions must be downloaded from https://archive.apache.org/dist/,
+# but that site is slower, so we use the recommended download site instead
+RUN if [ -z "${scala_version}" ]; then \
+    curl --progress-bar --location --output "spark.tgz" \
+        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
+  else \
+    curl --progress-bar --location --output "spark.tgz" \
+        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
+  fi && \
+  echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
+  tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
+  rm "spark.tgz"
+
+# Configure Spark
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
+    PATH="${PATH}:${SPARK_HOME}/bin"
+
+RUN if [ -z "${scala_version}" ]; then \
+    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
+  else \
+    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
+  fi && \
+  # Add a link in the before_notebook hook in order to source PYTHONPATH automatically && \
+  ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
+
+# Configure IPython system-wide
+COPY ipython_kernel_config.py "/etc/ipython/"
+RUN fix-permissions "/etc/ipython/"
+
+USER ${NB_UID}
+
+# Install pyarrow
+# NOTE: It's important to ensure compatibility between Pandas versions.
+# The pandas version in this Dockerfile should match the version
+# on which the Pandas API for Spark is built.
+# To find the right version:
+# 1. Check out the Spark branch you are on.
+# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
+RUN mamba install --yes \
+    'grpcio-status' \
+    'grpcio' \
+    'pandas=2.0.3' \
+    'pyarrow' && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+WORKDIR "${HOME}"
+EXPOSE 4040
+
+###########################
+### all-spark-notebook
+###########################
+# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/all-spark-notebook/Dockerfile
+
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# RSpark config
+ENV R_LIBS_USER "${SPARK_HOME}/R/lib"
+RUN fix-permissions "${R_LIBS_USER}"
+
+# R pre-requisites
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    fonts-dejavu \
+    gfortran \
+    gcc && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+USER ${NB_UID}
+
+# R packages including IRKernel which gets installed globally.
+RUN mamba install --yes \
+    'r-base' \
+    'r-ggplot2' \
+    'r-irkernel' \
+    'r-rcurl' \
+    'r-sparklyr' && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile
index e0f7775ce..bc37b4ac6 100644
--- a/output/jupyterlab-cpu/Dockerfile
+++ b/output/jupyterlab-cpu/Dockerfile
@@ -28,6 +28,143 @@ RUN apt-get update --yes \
     && rm -rf /var/lib/apt/lists/* \
     && chmod +x /usr/bin/clean-layer.sh
 
+###############################
+### docker-bits/2_Spark.Dockerfile
+###############################
+
+# Spark stuff
+
+###########################
+### pyspark-notebook
+###########################
+# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/pyspark-notebook/Dockerfile
+
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# Spark dependencies
+# Default values can be overridden at build time
+# (ARGS are in lower case to distinguish them from ENV)
+ARG spark_version="3.5.0"
+ARG hadoop_version="3"
+ARG scala_version
+ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
+ARG openjdk_version="17"
+
+ENV APACHE_SPARK_VERSION="${spark_version}" \
+    HADOOP_VERSION="${hadoop_version}"
+
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    "openjdk-${openjdk_version}-jre-headless" \
+    ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Spark installation
+WORKDIR /tmp
+
+# Older Spark versions must be downloaded from https://archive.apache.org/dist/,
+# but that site is slower, so we use the recommended download site instead
+RUN if [ -z "${scala_version}" ]; then \
+    curl --progress-bar --location --output "spark.tgz" \
+        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
+  else \
+    curl --progress-bar --location --output "spark.tgz" \
+        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
+  fi && \
+  echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
+  tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
+  rm "spark.tgz"
+
+# Configure Spark
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
+    PATH="${PATH}:${SPARK_HOME}/bin"
+
+RUN if [ -z "${scala_version}" ]; then \
+    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
+  else \
+    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
+  fi && \
+  # Add a link in the before_notebook hook in order to source PYTHONPATH automatically && \
+  ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
+
+# Configure IPython system-wide
+COPY ipython_kernel_config.py "/etc/ipython/"
+RUN fix-permissions "/etc/ipython/"
+
+USER ${NB_UID}
+
+# Install pyarrow
+# NOTE: It's important to ensure compatibility between Pandas versions.
+# The pandas version in this Dockerfile should match the version
+# on which the Pandas API for Spark is built.
+# To find the right version:
+# 1. Check out the Spark branch you are on.
+# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
+RUN mamba install --yes \
+    'grpcio-status' \
+    'grpcio' \
+    'pandas=2.0.3' \
+    'pyarrow' && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+WORKDIR "${HOME}"
+EXPOSE 4040
+
+###########################
+### all-spark-notebook
+###########################
+# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/all-spark-notebook/Dockerfile
+
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# RSpark config
+ENV R_LIBS_USER "${SPARK_HOME}/R/lib"
+RUN fix-permissions "${R_LIBS_USER}"
+
+# R pre-requisites
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    fonts-dejavu \
+    gfortran \
+    gcc && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+USER ${NB_UID}
+
+# R packages including IRKernel which gets installed globally.
+RUN mamba install --yes \
+    'r-base' \
+    'r-ggplot2' \
+    'r-irkernel' \
+    'r-rcurl' \
+    'r-sparklyr' && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
 ###############################
 ### docker-bits/3_Kubeflow.Dockerfile
 ###############################
diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile
index 65676c79f..703ed0c6a 100644
--- a/output/jupyterlab-pytorch/Dockerfile
+++ b/output/jupyterlab-pytorch/Dockerfile
@@ -50,6 +50,143 @@ RUN mamba create -n torch && \
     python -m ipykernel install --user --name torch --display-name "PyTorch"
 
 
+###############################
+### docker-bits/2_Spark.Dockerfile
+###############################
+
+# Spark stuff
+
+###########################
+### pyspark-notebook
+###########################
+# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/pyspark-notebook/Dockerfile
+
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# Spark dependencies
+# Default values can be overridden at build time
+# (ARGS are in lower case to distinguish them from ENV)
+ARG spark_version="3.5.0"
+ARG hadoop_version="3"
+ARG scala_version
+ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
+ARG openjdk_version="17"
+
+ENV APACHE_SPARK_VERSION="${spark_version}" \
+    HADOOP_VERSION="${hadoop_version}"
+
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    "openjdk-${openjdk_version}-jre-headless" \
+    ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Spark installation
+WORKDIR /tmp
+
+# Older Spark versions must be downloaded from https://archive.apache.org/dist/,
+# but that site is slower, so we use the recommended download site instead
+RUN if [ -z "${scala_version}" ]; then \
+    curl --progress-bar --location --output "spark.tgz" \
+        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
+  else \
+    curl --progress-bar --location --output "spark.tgz" \
+        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
+  fi && \
+  echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
+  tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
+  rm "spark.tgz"
+
+# Configure Spark
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
+    PATH="${PATH}:${SPARK_HOME}/bin"
+
+RUN if [ -z "${scala_version}" ]; then \
+    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
+  else \
+    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
+  fi && \
+  # Add a link in the before_notebook hook in order to source PYTHONPATH automatically && \
+  ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
+
+# Configure IPython system-wide
+COPY ipython_kernel_config.py "/etc/ipython/"
+RUN fix-permissions "/etc/ipython/"
+
+USER ${NB_UID}
+
+# Install pyarrow
+# NOTE: It's important to ensure compatibility between Pandas versions.
+# The pandas version in this Dockerfile should match the version
+# on which the Pandas API for Spark is built.
+# To find the right version:
+# 1. Check out the Spark branch you are on.
+# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
+RUN mamba install --yes \
+    'grpcio-status' \
+    'grpcio' \
+    'pandas=2.0.3' \
+    'pyarrow' && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+WORKDIR "${HOME}"
+EXPOSE 4040
+
+###########################
+### all-spark-notebook
+###########################
+# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/all-spark-notebook/Dockerfile
+
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# RSpark config
+ENV R_LIBS_USER "${SPARK_HOME}/R/lib"
+RUN fix-permissions "${R_LIBS_USER}"
+
+# R pre-requisites
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    fonts-dejavu \
+    gfortran \
+    gcc && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+USER ${NB_UID}
+
+# R packages including IRKernel which gets installed globally.
+RUN mamba install --yes \
+    'r-base' \
+    'r-ggplot2' \
+    'r-irkernel' \
+    'r-rcurl' \
+    'r-sparklyr' && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
 ###############################
 ### docker-bits/3_Kubeflow.Dockerfile
 ###############################
diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile
index d6c0ebfc6..c6a09d219 100644
--- a/output/jupyterlab-tensorflow/Dockerfile
+++ b/output/jupyterlab-tensorflow/Dockerfile
@@ -157,6 +157,143 @@ RUN mamba install --quiet --yes \
     fix-permissions $CONDA_DIR && \
     fix-permissions /home/$NB_USER
 
+###############################
+### docker-bits/2_Spark.Dockerfile
+###############################
+
+# Spark stuff
+
+###########################
+### pyspark-notebook
+###########################
+# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/pyspark-notebook/Dockerfile
+
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# Spark dependencies
+# Default values can be overridden at build time
+# (ARGS are in lower case to distinguish them from ENV)
+ARG spark_version="3.5.0"
+ARG hadoop_version="3"
+ARG scala_version
+ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
+ARG openjdk_version="17"
+
+ENV APACHE_SPARK_VERSION="${spark_version}" \
+    HADOOP_VERSION="${hadoop_version}"
+
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    "openjdk-${openjdk_version}-jre-headless" \
+    ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Spark installation
+WORKDIR /tmp
+
+# Older Spark versions must be downloaded from https://archive.apache.org/dist/,
+# but that site is slower, so we use the recommended download site instead
+RUN if [ -z "${scala_version}" ]; then \
+    curl --progress-bar --location --output "spark.tgz" \
+        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
+  else \
+    curl --progress-bar --location --output "spark.tgz" \
+        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
+  fi && \
+  echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
+  tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
+  rm "spark.tgz"
+
+# Configure Spark
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
+    PATH="${PATH}:${SPARK_HOME}/bin"
+
+RUN if [ -z "${scala_version}" ]; then \
+    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
+  else \
+    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
+  fi && \
+  # Add a link in the before_notebook hook in order to source PYTHONPATH automatically && \
+  ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
+
+# Configure IPython system-wide
+COPY ipython_kernel_config.py "/etc/ipython/"
+RUN fix-permissions "/etc/ipython/"
+
+USER ${NB_UID}
+
+# Install pyarrow
+# NOTE: It's important to ensure compatibility between Pandas versions.
+# The pandas version in this Dockerfile should match the version
+# on which the Pandas API for Spark is built.
+# To find the right version:
+# 1. Check out the Spark branch you are on.
+# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
+RUN mamba install --yes \
+    'grpcio-status' \
+    'grpcio' \
+    'pandas=2.0.3' \
+    'pyarrow' && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+WORKDIR "${HOME}"
+EXPOSE 4040
+
+###########################
+### all-spark-notebook
+###########################
+# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/all-spark-notebook/Dockerfile
+
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+
+
+# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# RSpark config
+ENV R_LIBS_USER "${SPARK_HOME}/R/lib"
+RUN fix-permissions "${R_LIBS_USER}"
+
+# R pre-requisites
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    fonts-dejavu \
+    gfortran \
+    gcc && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+USER ${NB_UID}
+
+# R packages including IRKernel which gets installed globally.
+RUN mamba install --yes \
+    'r-base' \
+    'r-ggplot2' \
+    'r-irkernel' \
+    'r-rcurl' \
+    'r-sparklyr' && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
 ###############################
 ### docker-bits/3_Kubeflow.Dockerfile
 ###############################
diff --git a/scripts/get-spark-stuff.sh b/scripts/get-spark-stuff.sh
index 75bf5184a..b8a57d203 100755
--- a/scripts/get-spark-stuff.sh
+++ b/scripts/get-spark-stuff.sh
@@ -53,9 +53,9 @@ get_file () {
 ###########################
 ### $FILE
 ###########################
-# https://raw.githubusercontent.com/jupyter/docker-stacks/$COMMIT/$FILE/Dockerfile
+# https://raw.githubusercontent.com/jupyter/docker-stacks/$COMMIT/images/$FILE/Dockerfile
 
-$(curl -s https://raw.githubusercontent.com/jupyter/docker-stacks/$COMMIT/$FILE/Dockerfile)
+$(curl -s https://raw.githubusercontent.com/jupyter/docker-stacks/$COMMIT/images/$FILE/Dockerfile)
 EOF
 }
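
Usage sketch: with this patch applied, the Spark bit and the generated output
Dockerfiles should be reproducible via the Makefile targets shown above,
assuming GNU make and network access to raw.githubusercontent.com. SPARK
defaults to main, so the override below is optional:

    # Regenerate docker-bits/2_Spark.Dockerfile from jupyter/docker-stacks
    # at the ref given by SPARK (a branch, tag, or commit SHA)
    make generate-Spark SPARK=main

    # Re-concatenate the docker-bits (now including 2_Spark.Dockerfile)
    # into output/jupyterlab-{cpu,pytorch,tensorflow}
    make jupyterlab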