From d656920e59d8834e147d1d472b61d869e3197068 Mon Sep 17 00:00:00 2001
From: Armando Miani
Date: Tue, 19 Sep 2023 06:14:11 +0200
Subject: [PATCH] [airflow] Docker images updates - 2023-09-14-123814 (#2432)

---
 docker/airflow/2/debian11/2.7/Dockerfile      | 265 +++++++++++++++
 .../2.7/scripts/docker/install_airflow.sh     |  16 +
 .../2.7/scripts/docker/install_mssql.sh       |  24 ++
 .../2.7/scripts/docker/install_mysql.sh       |  54 +++
 .../2.7/scripts/docker/install_pip_version.sh |  13 +
 .../2/debian11/2.7/scripts/prod/clean-logs.sh |  40 +++
 .../2.7/scripts/prod/entrypoint_prod.sh       | 318 ++++++++++++++++++
 docker/airflow/versions.yaml                  |  25 +-
 8 files changed, 751 insertions(+), 4 deletions(-)
 create mode 100644 docker/airflow/2/debian11/2.7/Dockerfile
 create mode 100755 docker/airflow/2/debian11/2.7/scripts/docker/install_airflow.sh
 create mode 100755 docker/airflow/2/debian11/2.7/scripts/docker/install_mssql.sh
 create mode 100755 docker/airflow/2/debian11/2.7/scripts/docker/install_mysql.sh
 create mode 100755 docker/airflow/2/debian11/2.7/scripts/docker/install_pip_version.sh
 create mode 100755 docker/airflow/2/debian11/2.7/scripts/prod/clean-logs.sh
 create mode 100755 docker/airflow/2/debian11/2.7/scripts/prod/entrypoint_prod.sh

diff --git a/docker/airflow/2/debian11/2.7/Dockerfile b/docker/airflow/2/debian11/2.7/Dockerfile
new file mode 100644
index 0000000000..898259b366
--- /dev/null
+++ b/docker/airflow/2/debian11/2.7/Dockerfile
@@ -0,0 +1,265 @@
+ARG AIRFLOW_VERSION="2.7.1"
+ARG AIRFLOW_EXTRAS="amazon,async,celery,cncf.kubernetes,dask,docker,elasticsearch,ftp,google,google_auth,grpc,hashicorp,http,ldap,microsoft.azure,mysql,odbc,pandas,postgres,redis,sendgrid,sftp,slack,ssh,statsd,virtualenv"
+
+ARG AIRFLOW_HOME=/opt/airflow
+ARG AIRFLOW_UID="50000"
+ARG AIRFLOW_USER_HOME_DIR=/home/airflow
+ARG AIRFLOW_PIP_VERSION="22.3.1"
+
+FROM marketplace.gcr.io/google/c2d-debian11 as airflow-build-image
+
+SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "nounset", "-o", "nolog", "-c"]
+ENV DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
+    LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8
+
+ENV DEV_APT_DEPS="\
+    apt-transport-https \
+    apt-utils \
+    build-essential \
+    ca-certificates \
+    dirmngr \
+    freetds-bin \
+    freetds-dev \
+    gosu \
+    krb5-user \
+    ldap-utils \
+    libffi-dev \
+    libkrb5-dev \
+    libldap2-dev \
+    libpq-dev \
+    libsasl2-2 \
+    libsasl2-dev \
+    libsasl2-modules \
+    libssl-dev \
+    locales \
+    lsb-release \
+    nodejs \
+    openssh-client \
+    pkg-config \
+    postgresql-client \
+    python3 \
+    python3-pip \
+    python3-dev \
+    sasl2-bin \
+    software-properties-common \
+    sqlite3 \
+    sudo \
+    unixodbc \
+    unixodbc-dev \
+    yarn"
+
+ENV DEV_APT_COMMAND="\
+    curl --silent --fail --location https://deb.nodesource.com/setup_14.x | \
+    bash -o pipefail -o errexit -o nolog - \
+    && curl --silent https://dl.yarnpkg.com/debian/pubkey.gpg | \
+    apt-key add - >/dev/null 2>&1\
+    && echo 'deb https://dl.yarnpkg.com/debian/ stable main' > /etc/apt/sources.list.d/yarn.list"
+
+RUN apt-get update \
+    && apt-get install --no-install-recommends -yqq apt-utils >/dev/null 2>&1 \
+    && apt-get install -y --no-install-recommends curl gnupg2 \
+    && mkdir -pv /usr/share/man/man1 \
+    && mkdir -pv /usr/share/man/man7 \
+    && bash -o pipefail -o errexit -o nounset -o nolog -c "${DEV_APT_COMMAND}" \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends ${DEV_APT_DEPS} \
+    && apt-get autoremove -yqq --purge \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG AIRFLOW_EXTRAS
+ARG AIRFLOW_VERSION
+ARG AIRFLOW_PIP_VERSION
+ARG AIRFLOW_HOME
+ARG AIRFLOW_USER_HOME_DIR
+ARG AIRFLOW_UID
+
+ENV AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION} \
+    AIRFLOW_VERSION=${AIRFLOW_VERSION} \
+    AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS} \
+    AIRFLOW_PRE_CACHED_PIP_PACKAGES="false" \
+    INSTALL_PROVIDERS_FROM_SOURCES="false" \
+    AIRFLOW_INSTALLATION_METHOD="apache-airflow" \
+    PATH=${PATH}:${AIRFLOW_USER_HOME_DIR}/.local/bin \
+    AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION} \
+    AIRFLOW_USER_HOME_DIR=${AIRFLOW_USER_HOME_DIR} \
+    AIRFLOW_HOME=${AIRFLOW_HOME} \
+    AIRFLOW_UID=${AIRFLOW_UID} \
+    INSTALL_MYSQL_CLIENT="true" \
+    INSTALL_MSSQL_CLIENT="true" \
+    PIP_USER="true"
+
+COPY scripts/docker/install_mysql.sh scripts/docker/install_mssql.sh /scripts/docker/
+
+RUN /scripts/docker/install_mysql.sh dev && /scripts/docker/install_mssql.sh
+ENV PATH=${PATH}:/opt/mssql-tools/bin
+
+RUN adduser --gecos "First Last,RoomNumber,WorkPhone,HomePhone" --disabled-password \
+    --quiet "airflow" --uid "${AIRFLOW_UID}" --gid "0" --home "${AIRFLOW_USER_HOME_DIR}" && \
+    mkdir -p ${AIRFLOW_HOME} && chown -R "airflow:0" "${AIRFLOW_USER_HOME_DIR}" ${AIRFLOW_HOME}
+
+USER airflow
+
+COPY --chown=airflow:0 scripts/docker/install_pip_version.sh /scripts/docker/
+
+RUN /scripts/docker/install_pip_version.sh
+
+ENV INSTALL_FROM_PYPI="true" \
+    EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS="dill<0.3.3 certifi<2021.0.0 google-ads<14.0.1"
+
+WORKDIR /opt/airflow
+
+COPY --chown=airflow:0 scripts/docker/install_airflow.sh /scripts/docker/
+
+RUN /scripts/docker/install_airflow.sh \
+    && find "${AIRFLOW_USER_HOME_DIR}/.local/" -name '*.pyc' -print0 | xargs -0 rm -f || true \
+    && find "${AIRFLOW_USER_HOME_DIR}/.local/" -type d -name '__pycache__' -print0 | xargs -0 rm -rf || true \
+    && find "${AIRFLOW_USER_HOME_DIR}/.local" -executable -print0 | xargs --null chmod g+x \
+    && find "${AIRFLOW_USER_HOME_DIR}/.local" -print0 | xargs --null chmod g+rw
+
+FROM marketplace.gcr.io/google/c2d-debian11 as main
+
+SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "nounset", "-o", "nolog", "-c"]
+
+ARG AIRFLOW_USER_HOME_DIR
+ARG AIRFLOW_HOME
+ARG AIRFLOW_UID
+ARG AIRFLOW_PIP_VERSION
+ARG AIRFLOW_VERSION
+
+ENV AIRFLOW_VERSION=${AIRFLOW_VERSION} \
+    DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
+    LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8 \
+    AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION}
+
+ENV RUNTIME_APT_DEPS="\
+    apt-transport-https \
+    apt-utils \
+    ca-certificates \
+    curl \
+    dumb-init \
+    freetds-bin \
+    krb5-user \
+    ldap-utils \
+    libffi7 \
+    libldap-2.4-2 \
+    libsasl2-2 \
+    libsasl2-modules \
+    libssl1.1 \
+    locales \
+    lsb-release \
+    netcat \
+    openssh-client \
+    pkg-config \
+    postgresql-client \
+    python2 \
+    python3 \
+    python3-pip \
+    rsync \
+    sasl2-bin \
+    sqlite3 \
+    sudo \
+    unixodbc"
+
+ENV RUNTIME_APT_COMMAND="echo" \
+    INSTALL_MYSQL_CLIENT="true" \
+    INSTALL_MSSQL_CLIENT="true" \
+    AIRFLOW_INSTALLATION_METHOD="apache-airflow" \
+    AIRFLOW_UID=${AIRFLOW_UID} \
+    AIRFLOW__CORE__LOAD_EXAMPLES="false" \
+    AIRFLOW_USER_HOME_DIR=${AIRFLOW_USER_HOME_DIR} \
+    AIRFLOW_HOME=${AIRFLOW_HOME} \
+    PATH="${AIRFLOW_USER_HOME_DIR}/.local/bin:${PATH}" \
+    GUNICORN_CMD_ARGS="--worker-tmp-dir /dev/shm" \
+    PIP_USER="true"
+
+RUN apt-get update \
+    && apt-get install --no-install-recommends -yqq apt-utils >/dev/null 2>&1 \
+    && apt-get install -y --no-install-recommends curl gnupg2 \
+    && mkdir -pv /usr/share/man/man1 \
+    && mkdir -pv /usr/share/man/man7 \
+    && bash -o pipefail -o errexit -o nounset -o nolog -c "${RUNTIME_APT_COMMAND}" \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends ${RUNTIME_APT_DEPS} \
+    && apt-get autoremove -yqq --purge \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /var/log/* \
+    && ln -s /usr/bin/python3 /usr/bin/python
+
+# Install Gosu
+# /usr/sbin/gosu
+ENV GOSU_VERSION 1.16
+RUN set -eux; \
+    apt-get update; \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        gpg \
+        gpgv \
+        libjemalloc2 \
+        pwgen \
+        tzdata \
+        xz-utils \
+        zstd ; \
+    savedAptMark="$(apt-mark showmanual)"; \
+    apt-get install -y --no-install-recommends \
+        dirmngr \
+        gpg-agent \
+        wget; \
+    rm -rf /var/lib/apt/lists/*; \
+    dpkgArch="$(dpkg --print-architecture | awk -F- '{ print $NF }')"; \
+    wget -q -O /usr/sbin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$dpkgArch"; \
+    wget -q -O /usr/sbin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$dpkgArch.asc"; \
+    GNUPGHOME="$(mktemp -d)"; \
+    export GNUPGHOME; \
+    gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4; \
+    gpg --batch --verify /usr/sbin/gosu.asc /usr/sbin/gosu; \
+    gpgconf --kill all; \
+    rm -rf "$GNUPGHOME" /usr/sbin/gosu.asc; \
+    apt-mark auto '.*' > /dev/null; \
+    [ -z "$savedAptMark" ] || apt-mark manual $savedAptMark >/dev/null; \
+    apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false; \
+    chmod +x /usr/sbin/gosu; \
+    gosu --version; \
+    gosu nobody true
+
+
+COPY scripts/docker/install_mysql.sh /scripts/docker/install_mssql.sh /scripts/docker/
+RUN chmod a+x /scripts/docker/install_mysql.sh /scripts/docker/install_mssql.sh \
+    && sync \
+    && /scripts/docker/install_mysql.sh prod \
+    && /scripts/docker/install_mssql.sh \
+    && adduser --gecos "First Last,RoomNumber,WorkPhone,HomePhone" --disabled-password \
+        --quiet "airflow" --uid "${AIRFLOW_UID}" --gid "0" --home "${AIRFLOW_USER_HOME_DIR}" \
+    && mkdir -pv "${AIRFLOW_HOME}" \
+    && mkdir -pv "${AIRFLOW_HOME}/dags" \
+    && mkdir -pv "${AIRFLOW_HOME}/logs" \
+    && chown -R airflow:0 "${AIRFLOW_USER_HOME_DIR}" "${AIRFLOW_HOME}" \
+    && chmod -R g+rw "${AIRFLOW_USER_HOME_DIR}" "${AIRFLOW_HOME}" \
+    && find "${AIRFLOW_HOME}" -executable -print0 | xargs --null chmod g+x \
+    && find "${AIRFLOW_USER_HOME_DIR}" -executable -print0 | xargs --null chmod g+x
+
+COPY --chown=airflow:0 --from=airflow-build-image \
+    "${AIRFLOW_USER_HOME_DIR}/.local" "${AIRFLOW_USER_HOME_DIR}/.local"
+COPY --chown=airflow:0 scripts/prod/entrypoint_prod.sh /entrypoint
+COPY --chown=airflow:0 scripts/prod/clean-logs.sh /clean-logs
+
+RUN chmod a+x /entrypoint /clean-logs \
+    && chmod g=u /etc/passwd \
+    && chmod g+w "${AIRFLOW_USER_HOME_DIR}/.local" \
+    && usermod -g 0 airflow -G 0
+
+RUN sed --in-place=.bak "s/secure_path=\"/secure_path=\"\/.venv\/bin:/" /etc/sudoers
+
+ENV DUMB_INIT_SETSID="1" \
+    PS1="(airflow)" \
+    LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libstdc++.so.6"
+
+WORKDIR ${AIRFLOW_HOME}
+EXPOSE 8080
+USER ${AIRFLOW_UID}
+
+ENV C2D_RELEASE 2.7.1
+
+ENTRYPOINT ["/usr/bin/dumb-init", "--", "/entrypoint"]
+CMD [""]
diff --git a/docker/airflow/2/debian11/2.7/scripts/docker/install_airflow.sh b/docker/airflow/2/debian11/2.7/scripts/docker/install_airflow.sh
new file mode 100755
index 0000000000..b9551ad5dd
--- /dev/null
+++ b/docker/airflow/2/debian11/2.7/scripts/docker/install_airflow.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+: "${AIRFLOW_PIP_VERSION:?Should be set}"
+
+function install_airflow() {
+    # Remove mysql from extras if client is not going to be installed
+    if [[ ${INSTALL_MYSQL_CLIENT} != "true" ]]; then
+        AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS/mysql,}
+    fi
+    echo
+    echo "Installing all packages and upgrading them if needed"
+    echo
+    pip install --upgrade --upgrade-strategy only-if-needed "${AIRFLOW_INSTALLATION_METHOD}[${AIRFLOW_EXTRAS}]==${AIRFLOW_VERSION}" airflow-exporter
+}
+
+install_airflow
diff --git a/docker/airflow/2/debian11/2.7/scripts/docker/install_mssql.sh b/docker/airflow/2/debian11/2.7/scripts/docker/install_mssql.sh
new file mode 100755
index 0000000000..0f378c22e8
--- /dev/null
+++ b/docker/airflow/2/debian11/2.7/scripts/docker/install_mssql.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+: "${INSTALL_MSSQL_CLIENT:?Should be true or false}"
+
+function install_mssql_client() {
+    echo
+    echo "Installing mssql client"
+    echo
+    curl --silent https://packages.microsoft.com/keys/microsoft.asc | apt-key add - >/dev/null 2>&1
+    curl --silent https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list
+    apt-get update -yqq
+    apt-get upgrade -yqq
+    ACCEPT_EULA=Y apt-get -yqq install -y --no-install-recommends msodbcsql17 mssql-tools
+    rm -rf /var/lib/apt/lists/*
+    apt-get autoremove -yqq --purge
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+}
+
+# Install MsSQL client from Microsoft repositories
+if [[ ${INSTALL_MSSQL_CLIENT:="true"} == "true" ]]; then
+    install_mssql_client "${@}"
+fi
diff --git a/docker/airflow/2/debian11/2.7/scripts/docker/install_mysql.sh b/docker/airflow/2/debian11/2.7/scripts/docker/install_mysql.sh
new file mode 100755
index 0000000000..4073561a91
--- /dev/null
+++ b/docker/airflow/2/debian11/2.7/scripts/docker/install_mysql.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+declare -a packages
+
+MYSQL_VERSION="8.0"
+readonly MYSQL_VERSION
+
+: "${INSTALL_MYSQL_CLIENT:?Should be true or false}"
+
+install_mysql_client() {
+    echo
+    echo "Installing mysql client version ${MYSQL_VERSION}"
+    echo
+
+    if [[ "${1}" == "dev" ]]; then
+        packages=("libmysqlclient-dev" "mysql-client")
+    elif [[ "${1}" == "prod" ]]; then
+        packages=("libmysqlclient21" "mysql-client")
+    else
+        echo
+        echo "Specify either prod or dev"
+        echo
+        exit 1
+    fi
+
+    local key="467B942D3A79BD29"
+    readonly key
+
+    GNUPGHOME="$(mktemp -d)"
+    export GNUPGHOME
+    set +e
+    for keyserver in $(shuf -e ha.pool.sks-keyservers.net hkp://p80.pool.sks-keyservers.net:80 \
+        keyserver.ubuntu.com hkp://keyserver.ubuntu.com:80)
+    do
+        gpg --keyserver "${keyserver}" --recv-keys "${key}" 2>&1 && break
+    done
+    set -e
+    gpg --export "${key}" > /etc/apt/trusted.gpg.d/mysql.gpg
+    gpgconf --kill all
+    rm -rf "${GNUPGHOME}"
+    unset GNUPGHOME
+    echo "deb http://repo.mysql.com/apt/debian/ buster mysql-${MYSQL_VERSION}" | tee -a /etc/apt/sources.list.d/mysql.list
+    apt-get update
+    apt-get install --no-install-recommends -y "${packages[@]}"
+    apt-get autoremove -yqq --purge
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+}
+
+# Install MySQL client from Oracle repositories (Debian installs mariadb)
+# But only if it is not disabled
+if [[ ${INSTALL_MYSQL_CLIENT:="true"} == "true" ]]; then
+    install_mysql_client "${@}"
+fi
diff --git a/docker/airflow/2/debian11/2.7/scripts/docker/install_pip_version.sh b/docker/airflow/2/debian11/2.7/scripts/docker/install_pip_version.sh
new file mode 100755
index 0000000000..ba15c70501
--- /dev/null
+++ b/docker/airflow/2/debian11/2.7/scripts/docker/install_pip_version.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+: "${AIRFLOW_PIP_VERSION:?Should be set}"
+
+function install_pip_version() {
+    echo
+    echo "Installing pip version ${AIRFLOW_PIP_VERSION}"
+    echo
+    pip install --disable-pip-version-check --no-cache-dir --upgrade "pip==${AIRFLOW_PIP_VERSION}" &&
+        mkdir -p ${HOME}/.local/bin
+}
+
+install_pip_version
diff --git a/docker/airflow/2/debian11/2.7/scripts/prod/clean-logs.sh b/docker/airflow/2/debian11/2.7/scripts/prod/clean-logs.sh
new file mode 100755
index 0000000000..57b6e8b605
--- /dev/null
+++ b/docker/airflow/2/debian11/2.7/scripts/prod/clean-logs.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+readonly DIRECTORY="${AIRFLOW_HOME:-/usr/local/airflow}"
+readonly RETENTION="${AIRFLOW__LOG_RETENTION_DAYS:-15}"
+
+trap "exit" INT TERM
+
+readonly EVERY=$((15*60))
+
+echo "Cleaning logs every $EVERY seconds"
+
+while true; do
+  echo "Trimming airflow logs to ${RETENTION} days."
+  find "${DIRECTORY}"/logs \
+    -type d -name 'lost+found' -prune -o \
+    -type f -mtime +"${RETENTION}" -name '*.log' -print0 | \
+    xargs -0 rm -f
+
+  seconds=$(( $(date -u +%s) % EVERY))
+  (( seconds < 1 )) || sleep $((EVERY - seconds))
+done
diff --git a/docker/airflow/2/debian11/2.7/scripts/prod/entrypoint_prod.sh b/docker/airflow/2/debian11/2.7/scripts/prod/entrypoint_prod.sh
new file mode 100755
index 0000000000..6f74ce68b6
--- /dev/null
+++ b/docker/airflow/2/debian11/2.7/scripts/prod/entrypoint_prod.sh
@@ -0,0 +1,318 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# Might be empty
+AIRFLOW_COMMAND="${1:-}"
+
+set -euo pipefail
+
+function run_check_with_retries {
+    local cmd
+    cmd="${1}"
+    local countdown
+    countdown="${CONNECTION_CHECK_MAX_COUNT}"
+
+    while true
+    do
+        set +e
+        local last_check_result
+        local res
+        last_check_result=$(eval "${cmd} 2>&1")
+        res=$?
+        set -e
+        if [[ ${res} == 0 ]]; then
+            echo
+            break
+        else
+            echo -n "."
+            countdown=$((countdown-1))
+        fi
+        if [[ ${countdown} == 0 ]]; then
+            echo
+            echo "ERROR! Maximum number of retries (${CONNECTION_CHECK_MAX_COUNT}) reached."
+            echo
+            echo "Last check result:"
+            echo "$ ${cmd}"
+            echo "${last_check_result}"
+            echo
+            exit 1
+        else
+            sleep "${CONNECTION_CHECK_SLEEP_TIME}"
+        fi
+    done
+}
+
+function run_nc() {
+    # Checks if it is possible to connect to the host using netcat.
+    #
+    # We want to avoid misleading messages and perform only forward lookup of the service IP address.
+    # Netcat when run without -n performs both forward and reverse lookup and fails if the reverse
+    # lookup name does not match the original name even if the host is reachable via IP. This happens
+    # randomly with docker-compose in GitHub Actions.
+    # Since we are not using reverse lookup elsewhere, we can perform forward lookup in python
+    # and use the IP in nc with the '-n' switch added to disable any DNS use.
+    # Even if this message might be harmless, it might hide the real reason for the problem,
+    # which is the long time needed to start some services. Seeing this message can be totally
+    # misleading when you try to analyse the problem, which is why it's best to avoid it.
+    local host="${1}"
+    local port="${2}"
+    local ip
+    ip=$(python -c "import socket; print(socket.gethostbyname('${host}'))")
+    nc -zvvn "${ip}" "${port}"
+}
+
+
+function wait_for_connection {
+    # Waits for connection to the backend specified via the URL passed as the first parameter.
+    # Detects the backend type depending on the URL schema and assigns
+    # default port numbers if not specified in the URL.
+    # Then it loops until a connection to the host/port specified can be established.
+    # It tries `CONNECTION_CHECK_MAX_COUNT` times and sleeps `CONNECTION_CHECK_SLEEP_TIME` between checks.
+    local connection_url
+    connection_url="${1}"
+    local detected_backend
+    detected_backend=$(python -c "from urllib.parse import urlsplit; import sys; print(urlsplit(sys.argv[1]).scheme)" "${connection_url}")
+    local detected_host
+    detected_host=$(python -c "from urllib.parse import urlsplit; import sys; print(urlsplit(sys.argv[1]).hostname)" "${connection_url}")
+    local detected_port
+    detected_port=$(python -c "from urllib.parse import urlsplit; import sys; print(urlsplit(sys.argv[1]).port or '')" "${connection_url}")
+
+    echo BACKEND="${BACKEND:=${detected_backend}}"
+    readonly BACKEND
+
+    if [[ -z "${detected_port=}" ]]; then
+        if [[ ${BACKEND} == "postgres"* ]]; then
+            detected_port=5432
+        elif [[ ${BACKEND} == "mysql"* ]]; then
+            detected_port=3306
+        elif [[ ${BACKEND} == "mssql"* ]]; then
+            detected_port=1433
+        elif [[ ${BACKEND} == "redis"* ]]; then
+            detected_port=6379
+        elif [[ ${BACKEND} == "amqp"* ]]; then
+            detected_port=5672
+        fi
+    fi
+
+    detected_host=${detected_host:="localhost"}
+
+    # Allow the DB parameters to be overridden by environment variable
+    echo DB_HOST="${DB_HOST:=${detected_host}}"
+    readonly DB_HOST
+
+    echo DB_PORT="${DB_PORT:=${detected_port}}"
+    readonly DB_PORT
+    run_check_with_retries "run_nc ${DB_HOST@Q} ${DB_PORT@Q}"
+}
+
+function create_www_user() {
+    local local_password=""
+    # Warning: command environment variables (*_CMD) have priority over usual configuration variables
+    # for configuration parameters that require sensitive information. This is the case for the SQL database
+    # and the broker backend in this entrypoint script.
+    if [[ -n "${_AIRFLOW_WWW_USER_PASSWORD_CMD=}" ]]; then
+        local_password=$(eval "${_AIRFLOW_WWW_USER_PASSWORD_CMD}")
+        unset _AIRFLOW_WWW_USER_PASSWORD_CMD
+    elif [[ -n "${_AIRFLOW_WWW_USER_PASSWORD=}" ]]; then
+        local_password="${_AIRFLOW_WWW_USER_PASSWORD}"
+        unset _AIRFLOW_WWW_USER_PASSWORD
+    fi
+    if [[ -z ${local_password} ]]; then
+        echo
+        echo "ERROR! Airflow Admin password not set via _AIRFLOW_WWW_USER_PASSWORD or _AIRFLOW_WWW_USER_PASSWORD_CMD variables!"
+        echo
+        exit 1
+    fi
+
+    airflow users create \
+        --username "${_AIRFLOW_WWW_USER_USERNAME="admin"}" \
+        --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME="Airflow"}" \
+        --lastname "${_AIRFLOW_WWW_USER_LASTNAME="Admin"}" \
+        --email "${_AIRFLOW_WWW_USER_EMAIL="airflowadmin@example.com"}" \
+        --role "${_AIRFLOW_WWW_USER_ROLE="Admin"}" \
+        --password "${local_password}" || true
+}
+
+function create_system_user_if_missing() {
+    # This is needed in case of OpenShift-compatible container execution. In case of OpenShift a random
+    # user id is used when starting the image, while group 0 is kept as the user group. Our production
+    # image is OpenShift compatible, so all permissions on all folders are set so that group 0 can exercise
+    # the same privileges as the default "airflow" user. This code checks if the user is already
+    # present in /etc/passwd and will create the system user dynamically, including setting its
+    # HOME directory to /home/airflow so that (for example) the ${HOME}/.local folder where airflow is
+    # installed can be automatically added to PYTHONPATH
+    if ! whoami &> /dev/null; then
+        if [[ -w /etc/passwd ]]; then
+            echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${AIRFLOW_USER_HOME_DIR}:/sbin/nologin" \
+                >> /etc/passwd
+        fi
+        export HOME="${AIRFLOW_USER_HOME_DIR}"
+    fi
+}
+
+function set_pythonpath_for_root_user() {
+    # Airflow is installed as a local user application, which means that if the container is running as root
+    # the application is not available, because Python then only loads system-wide applications.
+    # This also adds applications installed by the local user "airflow".
+    if [[ $UID == "0" ]]; then
+        local python_major_minor
+        python_major_minor="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)"
+        export PYTHONPATH="${AIRFLOW_USER_HOME_DIR}/.local/lib/python${python_major_minor}/site-packages:${PYTHONPATH:-}"
+        >&2 echo "The container is run as root user. For security, consider using a regular user account."
+    fi
+}
+
+function wait_for_airflow_db() {
+    # Wait for the command to run successfully to validate the database connection.
+    run_check_with_retries "airflow db check"
+}
+
+function upgrade_db() {
+    # Runs airflow db upgrade
+    airflow db upgrade || true
+}
+
+function wait_for_celery_broker() {
+    # Verifies connection to the Celery Broker
+    local executor
+    executor="$(airflow config get-value core executor)"
+    if [[ "${executor}" == "CeleryExecutor" ]]; then
+        local connection_url
+        connection_url="$(airflow config get-value celery broker_url)"
+        wait_for_connection "${connection_url}"
+    fi
+}
+
+function exec_to_bash_or_python_command_if_specified() {
+    # If one of the commands: 'bash', 'python' is used, run the appropriate
+    # command with exec
+    if [[ ${AIRFLOW_COMMAND} == "bash" ]]; then
+        shift
+        exec "/bin/bash" "${@}"
+    elif [[ ${AIRFLOW_COMMAND} == "python" ]]; then
+        shift
+        exec "python" "${@}"
+    fi
+}
+
+function check_uid_gid() {
+    if [[ $(id -g) == "0" ]]; then
+        return
+    fi
+    if [[ $(id -u) == "50000" ]]; then
+        >&2 echo
+        >&2 echo "WARNING! You should run the image with GID (Group ID) set to 0"
+        >&2 echo " even if you use 'airflow' user (UID=50000)"
+        >&2 echo
+        >&2 echo " You started the image with UID=$(id -u) and GID=$(id -g)"
+        >&2 echo
+        >&2 echo " This is to make sure you can run the image with an arbitrary UID in the future."
+        >&2 echo
+        >&2 echo " See more about it in the Airflow's docker image documentation"
+        >&2 echo " http://airflow.apache.org/docs/docker-stack/entrypoint"
+        >&2 echo
+        # We still allow the image to run with `airflow` user.
+        return
+    else
+        >&2 echo
+        >&2 echo "ERROR! You should run the image with GID=0"
+        >&2 echo
+        >&2 echo " You started the image with UID=$(id -u) and GID=$(id -g)"
+        >&2 echo
+        >&2 echo "The image should always be run with GID (Group ID) set to 0 regardless of the UID used."
+        >&2 echo " This is to make sure you can run the image with an arbitrary UID."
+        >&2 echo
+        >&2 echo " See more about it in the Airflow's docker image documentation"
+        >&2 echo " http://airflow.apache.org/docs/docker-stack/entrypoint"
+        # This will not work so we fail hard
+        exit 1
+    fi
+}
+
+# In the Airflow image we are setting the PIP_USER variable to true, in order to install all the packages
+# by default with the ``--user`` flag. However, this is a problem if a virtualenv is created later,
+# which happens in PythonVirtualenvOperator. We are unsetting this variable here, so that it is
+# not set when PIP is run by Airflow later on
+unset PIP_USER
+
+check_uid_gid
+
+# Set umask to 0002 to make all the directories created by the current user group-writeable
+# This allows the same directories to be writeable for any arbitrary user the image will be
+# run with, when the directory is created on a mounted volume and when that volume is later
+# reused with a different UID (but with GID=0)
+umask 0002
+
+CONNECTION_CHECK_MAX_COUNT=${CONNECTION_CHECK_MAX_COUNT:=20}
+readonly CONNECTION_CHECK_MAX_COUNT
+
+CONNECTION_CHECK_SLEEP_TIME=${CONNECTION_CHECK_SLEEP_TIME:=3}
+readonly CONNECTION_CHECK_SLEEP_TIME
+
+create_system_user_if_missing
+set_pythonpath_for_root_user
+if [[ "${CONNECTION_CHECK_MAX_COUNT}" -gt "0" ]]; then
+    wait_for_airflow_db
+fi
+
+if [[ -n "${_AIRFLOW_DB_UPGRADE=}" ]] ; then
+    upgrade_db
+fi
+
+if [[ -n "${_AIRFLOW_WWW_USER_CREATE=}" ]] ; then
+    create_www_user
+fi
+
+if [[ -n "${_PIP_ADDITIONAL_REQUIREMENTS=}" ]] ; then
+    >&2 echo
+    >&2 echo "!!!!! Installing additional requirements: '${_PIP_ADDITIONAL_REQUIREMENTS}' !!!!!!!!!!!!"
+    >&2 echo
+    >&2 echo "WARNING: This is a development/test feature only. NEVER use it in production!"
+    >&2 echo " Instead, build a custom image as described in"
+    >&2 echo
+    >&2 echo " https://airflow.apache.org/docs/docker-stack/build.html"
+    >&2 echo
+    >&2 echo " Adding requirements at container startup is fragile and is done every time"
+    >&2 echo " the container starts, so it is only useful for testing and trying out"
+    >&2 echo " new dependencies."
+    >&2 echo
+    pip install --no-cache-dir ${_PIP_ADDITIONAL_REQUIREMENTS}
+fi
+
+
+# The `bash` and `python` commands should also verify the basic connections
+# So they are run after the DB check
+exec_to_bash_or_python_command_if_specified "${@}"
+
+# Remove "airflow" if it is specified as airflow command
+# This way both command types work the same way:
+#
+#     docker run IMAGE airflow webserver
+#     docker run IMAGE webserver
+#
+if [[ ${AIRFLOW_COMMAND} == "airflow" ]]; then
+    AIRFLOW_COMMAND="${2:-}"
+    shift
+fi
+
+# Note: the broker backend configuration concerns only a subset of Airflow components
+if [[ ${AIRFLOW_COMMAND} =~ ^(scheduler|celery)$ ]] \
+    && [[ "${CONNECTION_CHECK_MAX_COUNT}" -gt "0" ]]; then
+    wait_for_celery_broker
+fi
+
+exec "airflow" "${@}"
diff --git a/docker/airflow/versions.yaml b/docker/airflow/versions.yaml
index a9acf44b79..b35decb9db 100644
--- a/docker/airflow/versions.yaml
+++ b/docker/airflow/versions.yaml
@@ -16,22 +16,39 @@ cloudbuild:
   enable_parallel: false
 versions:
+- dir: 2/debian11/2.7
+  from: marketplace.gcr.io/google/c2d-debian11
+  packages:
+    airflow:
+      version: 2.7.1
+    gosu:
+      gpg: B42F6819007F00F88E364FD4036A9C25BF357DD4
+      version: '1.16'
+    pip:
+      version: 22.3.1
+  repo: airflow2
+  tags:
+  - 2.7.1-debian11
+  - 2.7-debian11
+  - 2-debian11
+  - 2.7.1
+  - '2.7'
+  - '2'
+  - latest
 - dir: 2/debian11/2.6
   from: marketplace.gcr.io/google/c2d-debian11
   packages:
     airflow:
       version: 2.6.3
     gosu:
-      version: 1.16
       gpg: B42F6819007F00F88E364FD4036A9C25BF357DD4
+      version: '1.16'
     pip:
       version: 22.3.1
   repo: airflow2
   tags:
   - 2.6.3-debian11
   - 2.6-debian11
-  - 2-debian11
   - 2.6.3
   - '2.6'
-  - '2'
-  - latest
+
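Reviewer note: a quick local smoke test for the new 2.7 image. This is a sketch under assumptions: the image name/tag below is made up based on the `tags` list in versions.yaml, and the `_AIRFLOW_*` variables are the ones handled by entrypoint_prod.sh above (with the default sqlite backend, `airflow db check` in the entrypoint succeeds without any external database).

    # Build from the added directory; the COPY paths in the Dockerfile are
    # relative to this build context
    docker build -t airflow2:2.7.1-debian11 docker/airflow/2/debian11/2.7

    # Let the entrypoint upgrade the metadata DB and create an admin user,
    # then start the webserver on the exposed port 8080
    docker run --rm -p 8080:8080 \
        -e _AIRFLOW_DB_UPGRADE=true \
        -e _AIRFLOW_WWW_USER_CREATE=true \
        -e _AIRFLOW_WWW_USER_PASSWORD=admin \
        airflow2:2.7.1-debian11 webserver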
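The check_uid_gid/create_system_user_if_missing pair in the entrypoint is what makes OpenShift-style arbitrary UIDs work, as long as the GID stays 0 (the Dockerfile's `chmod g=u /etc/passwd` makes the passwd file group-writeable for this). A minimal check, reusing the assumed tag from above with an arbitrary hypothetical UID:

    # Any UID is accepted when the group is 0; the entrypoint writes a
    # matching /etc/passwd entry for the unknown UID at startup
    docker run --rm --user 12345:0 airflow2:2.7.1-debian11 version

    # A non-zero GID with the default airflow UID (50000) only warns;
    # any other UID with a non-zero GID makes the entrypoint exit hard
    docker run --rm --user 50000:50000 airflow2:2.7.1-debian11 version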