
Commit

Finalized and worked on cloud
jomariya23156 committed Apr 2, 2024
1 parent ff25772 commit 8de5e6d
Showing 10 changed files with 168 additions and 94 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build_push_docker_hub.yaml
@@ -68,7 +68,7 @@ jobs:
build-args: |
AIRFLOW_HOME=/opt/airflow
build_and_push_ray_arm64: # Job for building and pushing the images
build_and_push_ray_arm64: # Job for building and pushing the Ray image for arm64 platform
name: Build and Push Ray arm64 Image
runs-on: ubuntu-latest
needs: shared_steps # Dependency on the shared steps
@@ -110,7 +110,7 @@ jobs:
MLFLOW_ARTIFACT_ROOT=/storage/mlruns
ARCH_TRAILING_IMG_NAME=-aarch64
build_and_push_ray_amd64: # Job for building and pushing the images
build_and_push_ray_amd64: # Job for building and pushing the Ray image for amd64 platform
name: Build and Push Ray amd64 Image
runs-on: ubuntu-latest
needs: shared_steps # Dependency on the shared steps
129 changes: 66 additions & 63 deletions .github/workflows/build_push_gke.yaml
@@ -1,63 +1,66 @@
# name: Build, Push, and Deploy to GKE

# on:
# push:
# branches:
# - master

# env:
# PROJECT_ID: ${{ secrets.GKE_PROJECT }}
# GKE_CLUSTER: sfmlops-cluster # cluster name
# GKE_ZONE: asia-southeast1-a # cluster zone
# IMAGE_TAG: ${{ github.sha }} # use commit sha as a image tag
# GAR_ZONE: asia-southeast1 # artifact registry zone
# GAR_REPO: sfmlops-registry # artifact registry repository

# jobs:
# build_and_push_images:
# name: Setup, Build, Publish, and Deploy to GCP
# runs-on: ubuntu-latest
# environment: production
# strategy:
# matrix:
# image: [
# { name: web-ui, context: ./services/web-ui, file: ./services/web-ui/Dockerfile, buildargs: "" },
# { name: training-service, context: ./services/training-service, file: ./services/training-service/Dockerfile, buildargs: "" },
# { name: data-producer, context: ./services/data-producer, file: ./services/data-producer/Dockerfile, buildargs: "" },
# { name: mlflow, context: ./services/mlflow, file: ./services/mlflow/Dockerfile, buildargs: "" },
# { name: airflow-spark, context: ./services/airflow, file: ./services/airflow/Dockerfile, buildargs: "--build-arg AIRFLOW_HOME=/opt/airflow" },
# { name: ray, context: ./services/ray, file: ./services/ray/Dockerfile, buildargs: "--build-arg MLFLOW_ARTIFACT_ROOT=/storage/mlruns --build-arg ARCH_TRAILING_IMG_NAME=-aarch64" },
# { name: forecast-service, context: ./services/forecast-service, file: ./services/forecast-service/Dockerfile, buildargs: "" }
# ]
# steps:
# - name: Checkout
# uses: actions/checkout@v3

# # setup gcloud cli
# - name: Authenticate
# id: auth
# uses: google-github-actions/auth@v2
# with:
# credentials_json: ${{ secrets.GKE_SA_KEY }}

# # config docker to use gcloud cli tool as a credential
# # helper for authentication
# - name: Docker config
# run: |-
# gcloud auth print-access-token | docker login -u oauth2accesstoken --password-stdin https://$GAR_ZONE-docker.pkg.dev

# # Get GKE credentials
# - name: Setup GKE credentials
# uses: google-github-actions/get-gke-credentials@v2
# with:
# cluster_name: ${{ env.GKE_CLUSTER }}
# location: ${{ env.GKE_ZONE }}

# - name: Build and push ${{ matrix.image.name }} Docker image
# run: |-
# docker build ${{ matrix.image.buildargs }} \
# --tag "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:${{ github.sha }}" \
# --tag "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:latest" \
# -f ${{ matrix.image.file }} ${{ matrix.image.context }}
# docker push "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:${{ github.sha }}"
# docker push "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:latest"
name: Build, Push, and Deploy to GKE

on:
push:
branches:
- master

env:
PROJECT_ID: ${{ secrets.GKE_PROJECT }}
GKE_CLUSTER: sfmlops-cluster # cluster name
GKE_ZONE: asia-southeast1-a # cluster zone
IMAGE_TAG: ${{ github.sha }} # use commit sha as an image tag
GAR_ZONE: asia-southeast1 # artifact registry zone
GAR_REPO: sfmlops-registry # artifact registry repository

jobs:
build_and_push_images:
name: Setup, Build, Publish, and Deploy to GCP
runs-on: ubuntu-latest
environment: production
strategy:
matrix:
image: [
{ name: web-ui, context: ./services/web-ui, file: ./services/web-ui/Dockerfile, buildargs: "" },
{ name: training-service, context: ./services/training-service, file: ./services/training-service/Dockerfile, buildargs: "" },
{ name: data-producer, context: ./services/data-producer, file: ./services/data-producer/Dockerfile, buildargs: "" },
{ name: mlflow, context: ./services/mlflow, file: ./services/mlflow/Dockerfile, buildargs: "" },
{ name: airflow-spark, context: ./services/airflow, file: ./services/airflow/Dockerfile, buildargs: "--build-arg AIRFLOW_HOME=/opt/airflow" },
{ name: ray, context: ./services/ray, file: ./services/ray/Dockerfile, buildargs: "--build-arg MLFLOW_ARTIFACT_ROOT=/storage/mlruns --build-arg ARCH_TRAILING_IMG_NAME=" },
{ name: forecast-service, context: ./services/forecast-service, file: ./services/forecast-service/Dockerfile, buildargs: "" }
]
steps:
- name: Checkout
uses: actions/checkout@v3

# setup gcloud cli
- name: Authenticate
id: auth
uses: google-github-actions/auth@v2
with:
credentials_json: ${{ secrets.GKE_SA_KEY }}

# config docker to use gcloud cli tool as a credential
# helper for authentication
- name: Docker config
run: |-
gcloud auth print-access-token | docker login -u oauth2accesstoken --password-stdin https://$GAR_ZONE-docker.pkg.dev
# Get GKE credentials
- name: Setup GKE credentials
uses: google-github-actions/get-gke-credentials@v2
with:
cluster_name: ${{ env.GKE_CLUSTER }}
location: ${{ env.GKE_ZONE }}

- name: Build and push ${{ matrix.image.name }} Docker image
run: |-
docker build ${{ matrix.image.buildargs }} \
--tag "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:${{ github.sha }}" \
--tag "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:latest" \
-f ${{ matrix.image.file }} ${{ matrix.image.context }}
docker push "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:${{ github.sha }}"
docker push "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:latest"
# you can also consider adding another step to run helm/kubectl commands for deployment
# but since the command sometimes freezes (especially for Airflow), we decided not to include it; a hedged sketch of such a job follows
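As the comment above suggests, a deployment job could be appended to this workflow. Below is a hedged sketch of what it might look like, reusing the Helm command from the README; the chart path, values files, and the choice to run it as a separate job (so it executes once rather than per matrix image) are assumptions, not part of the committed workflow.

```yaml
# Hypothetical follow-up job (sketch only) -- would sit under "jobs:" alongside build_and_push_images.
deploy_with_helm:
  name: Deploy main chart to GKE
  runs-on: ubuntu-latest
  environment: production
  needs: build_and_push_images   # run once, after all images are pushed
  steps:
    - name: Checkout
      uses: actions/checkout@v3
    - name: Authenticate
      uses: google-github-actions/auth@v2
      with:
        credentials_json: ${{ secrets.GKE_SA_KEY }}
    - name: Setup GKE credentials
      uses: google-github-actions/get-gke-credentials@v2
      with:
        cluster_name: ${{ env.GKE_CLUSTER }}
        location: ${{ env.GKE_ZONE }}
    - name: Helm upgrade
      run: |-
        cd sfmlops-helm
        helm dependency build
        helm upgrade --install --create-namespace -n mlops sfmlops-helm ./ -f values.yaml -f values-ray.yaml
```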
57 changes: 34 additions & 23 deletions README.md
@@ -1,16 +1,10 @@
# sales-forecast-mlops-at-scale
<h1 align="center"> Sales Forecast MLOps at Scale </h1>

Scalable End-to-end MLOps system for sales forecasting.
<p align="center"><b> ▶️ Highly scalable Cloud-native Machine Learning system ◀️ </b></p>

dataset: https://www.kaggle.com/datasets/pratyushakar/rossmann-store-sales
# Overview

Original docker-compose file: https://airflow.apache.org/docs/apache-airflow/2.8.3/docker-compose.yaml
Modification made:
- Removed postgres (connect to our existing with new username and pwd)
- Added env variable `SPARK_STREAM_CHECKPOINTS_PATH` and mount volume for this checkpoint
- Connect to `forecast_network` defined in our existing docker-compose
- Note when starting: need to specify both compose files i.e. `docker-compose -f docker-compose.yml -f docker-compose-airflow.yml`
From doc: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html
Scalable End-to-end MLOps system for sales forecasting.

# Development environment
1. Docker (ref: Docker version 24.0.6, build ed223bc)
@@ -47,40 +41,48 @@ Prerequisites: Docker and Kubernetes (in our case, it's Docker Desktop as pinned
## With Kubernetes/Helm (Local cluster)
*Note:* The system is quite large and heavy... we recommend running it locally only for a one-off test; if that works, move to the cloud if you want to play around longer, OR stick with Docker Compose (which went more smoothly in our case)
1. Install Helm `bash install-helm.sh`
2. `cd sfmlops-helm` and `helm dependency build` to fetch all dependencies
3. Both install and upgrade the main chart: `helm upgrade --install --create-namespace -n mlops sfmlops-helm ./ -f values.yaml -f values-ray.yaml`
2. Create airflow namespace: `kubectl create namespace airflow`
3. Deploy the main chart:
1. `cd sfmlops-helm` and `helm dependency build` to fetch all dependencies
2. `helm upgrade --install --create-namespace -n mlops sfmlops-helm ./ -f values.yaml -f values-ray.yaml`
4. Deploy Kafka:
1. [Only 1st time] `helm repo add bitnami https://charts.bitnami.com/bitnami`
2. `helm -n kafka upgrade --install kafka-release oci://registry-1.docker.io/bitnamicharts/kafka --create-namespace --version 23.0.7 -f values-kafka.yaml`
5. Deploy Airflow:
1. [Only 1st time] `helm repo add apache-airflow https://airflow.apache.org`
2. `helm -n airflow upgrade --install airflow apache-airflow/airflow --create-namespace --version 1.13.1 -f values-airflow.yaml`
6. Forward the Airflow UI port so we can access it: `kubectl port-forward svc/airflow-webserver 8080:8080 --namespace airflow`
7. Deploy Prometheus and Grafana:
3. Sometimes you might get a timeout error from this command (if you do, it means your machine spec is a bit too low for this system, like ours lol). That's totally fine: keep checking the status with kubectl; if all resources eventually start up correctly, you're good to go, otherwise try running the command again.
6. Deploy Prometheus and Grafana:
1. [Only 1st time] `helm repo add prometheus-community https://prometheus-community.github.io/helm-charts`
2. `helm -n monitoring upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack --create-namespace --version 57.2.0 -f values-kube-prometheus.yaml`
3. One of the good things about kube-prometheus-stack is that it comes with many pre-installed/pre-configured dashboards for Kubernetes. Feel free to explore!
8. That's it! Enjoy your highly-scalable Machine Learning system for Sales forecasting! :)
7. That's it! Enjoy your highly scalable Machine Learning system for Sales forecasting! :)

**Note:** If you want to change namespace `kafka` and/or release name `kafka-release` of Kafka, please also change them in `values.yaml` and `KAFKA_BOOTSTRAP_SERVER` env var in `values-airflow.yaml`. They are also used in templating.

**Note 2:** In Docker Compose, Ray has already been configured to pull the embedded dashboards from Grafana, but in Kubernetes, this process involves a lot more manual steps so we intentionally left it undone for ease of setup of this project. You can follow the guide [here](https://docs.ray.io/en/latest/cluster/kubernetes/k8s-ecosystem/prometheus-grafana.html) if you want to anyways.
**Note 2:** In Docker Compose, Ray has already been configured to pull the embedded dashboards from Grafana, but in Kubernetes this process involves a lot more manual steps, so we intentionally left it out to keep this project's setup simple. You can follow the guide [here](https://docs.ray.io/en/latest/cluster/kubernetes/k8s-ecosystem/prometheus-grafana.html) if you want to do it anyway; the rough idea is sketched below.
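For reference, the linked Ray guide boils down to telling the Ray head where Prometheus and Grafana live via environment variables. A minimal sketch follows; the service hostnames are assumptions based on a default `kube-prometheus-stack` install in the `monitoring` namespace used in this project, so verify them against your cluster before use.

```yaml
# Sketch only: env vars on the Ray head container (hostnames are assumptions).
env:
  - name: RAY_PROMETHEUS_HOST
    value: "http://kube-prometheus-stack-prometheus.monitoring.svc:9090"
  - name: RAY_GRAFANA_HOST
    value: "http://kube-prometheus-stack-grafana.monitoring.svc:80"
  - name: RAY_GRAFANA_IFRAME_HOST
    value: "http://localhost:3000"  # address your browser uses to reach Grafana
```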

## With Kubernetes/Helm (on GCP)
Prerequisites: GKE Cluster (Standard cluster, *NOT* Autopilot), Artifact Registry, Service Usage API
1. Follow this Medium blog. I recommend create a new Service Account with Owner role for quick and dirty run (but of course, please consult your cloud engineer if you have security concerns).
1. Follow this Medium blog. We recommend creating a new Service Account with the Owner role for a quick-and-dirty run (but of course, please consult your cloud engineer if you have security concerns).
2. Download your Service Account's JSON key
3. Activate your service account: `gcloud auth activate-service-account --key-file=<path to the JSON key>`
4. Connect local kubectl to cloud `gcloud container clusters get-credentials <GKE_CLUSTER_NAME> --zone <GKE_ZONE> --project <PROJECT_NAME>`
5. Create a namespace for airflow in prior `kubectl create namespace airflow` because our main chart contains 1 manifest `spark-streaming-pvc` pointing to 'airflow' namespace instead of 'mlops' like the rest. On local, it works pretty fine but on GCP it seems like it cannot auto create 'airflow' namespace during helm install mlops namespace.
3. Activate your service account: `gcloud auth activate-service-account --key-file=<PATH_TO_JSON_KEY>`
4. Connect local kubectl to cloud: `gcloud container clusters get-credentials <GKE_CLUSTER_NAME> --zone <GKE_ZONE> --project <PROJECT_NAME>`
5. Now kubectl (and helm) will work in the context of the GKE environment.
6. Follow the steps in the **With Kubernetes/Helm (Local cluster)** section.
7. If you face a timeout error when running Helm commands for Airflow, or the system struggles to start up and work correctly, we recommend upgrading the machine type of your cluster.

**Note:** For the node pool machine type in the GKE cluster, from our experiments, the default `e2-medium` is not quite enough, especially for Airflow and Ray. In our case, we went with `e2-standard-8` and a single node (why only one node is explained in the *Important note on MLflow's artifact path* section). We also found we needed to increase the PVC quota in IAM.

## Cleanup steps
1. `helm uninstall sfmlops-helm -n mlops`
2. `helm uninstall kafka-release -n kafka`
3. `helm uninstall airflow -n airflow`
4. `helm uninstall kube-prometheus-stack -n monitoring`

### Important note on MLflow's artifact path
In this setting, we point MLflow's artifact path at a local path. Internally, MLflow expects this path to be accessible from both the client and the server side (honestly, we don't like this model either), so it is normally an object storage path such as S3 (AWS) or Cloud Storage (GCP). For the full on-prem experience, we can create a Docker volume and mount it at the EXACT same path on both client and server to work around this. In a local Kubernetes cluster, we can do the same thing by creating a PVC with `accessModes: ReadWriteOnce` (in `sfmlops-helm/templates/mlflow-pvc.yaml`). **But** for on-cloud Kubernetes with a typical multi-node cluster, a PVC that can be read and written across nodes needs `accessModes: ReadWriteMany`, which most cloud providers *DO NOT* support; they encourage using centralized storage instead. Therefore, if you just want to try it out for fun, you can keep this exact setting and create a cluster with a single node (it will behave mostly like a local Kubernetes cluster, just on the cloud). For a real production environment, please create a cloud storage bucket, remove `mlflow-pvc.yaml` and its mount paths, and change the artifact path variable `MLFLOW_ARTIFACT_ROOT` in `sfmlops-helm/templates/global-configmap.yaml` to the cloud storage path, as sketched below. The official [doc](https://mlflow.org/docs/latest/tracking/artifacts-stores.html) covers artifact stores in more detail.
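A minimal sketch of that production change, assuming a GCS bucket (the bucket name is a placeholder):

```yaml
# sfmlops-helm/templates/global-configmap.yaml (sketch -- bucket name is a placeholder)
data:
  MLFLOW_ARTIFACT_ROOT: "gs://<your-bucket>/mlruns"  # object storage instead of the local PVC path
```

Note that with an object storage path, both the MLflow server and the training/serving pods also need credentials for the bucket (e.g. GKE Workload Identity or a mounted service-account key), which is outside the scope of this sketch.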

### Note on Kafka Docker Compose and Helm
Kafka's settings differ between Docker Compose and Helm: in Docker Compose we use KRaft for cluster metadata management (which is newer), but in Helm we use ZooKeeper because, honestly, we didn't manage to pull it off with KRaft, sorry :'( (it's quite complex). A rough sketch of the difference is shown below.
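For context, this is roughly how that choice typically shows up in a Bitnami Kafka values file; the `kraft.enabled` and `zookeeper.enabled` keys are assumptions about the chart version used here, so check the chart's own `values.yaml` before relying on them.

```yaml
# values-kafka.yaml (sketch -- key names are assumed for the Bitnami Kafka chart)
kraft:
  enabled: false    # KRaft (what the Docker Compose setup uses) is off in the Helm deployment
zookeeper:
  enabled: true     # the Helm deployment falls back to ZooKeeper
```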

@@ -131,8 +133,17 @@ In fact, you can submit the training jobs directly from **ANY** service in the s
### Using Ray with external Redis (in Docker Compose)
If we restart the Ray container, all previous job history is gone because Ray stores it in memory only. We can add an external Redis to persist this state, but in practice it seemed very unstable; the official docs also state that external Redis is only supported on-cloud / Kubernetes. We wanted to try anyway, and... from time to time during development we found that the Ray cluster would not accept job submissions and showed the error `Job supervisor actor could not be scheduled: The actor is not schedulable: The node specified via NodeAffinitySchedulingStrategy doesn't exist any more or is infeasible, and soft=False was specified.` We could fix that by removing all data in Redis with `docker-compose exec redis redis-cli FLUSHALL` AND/OR removing the Ray container and rebuilding it, but that was annoying and time consuming. So in the end we got rid of external Redis for Ray, bye~ (the wiring we tried is sketched below for reference).
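For the record, the external-Redis wiring we experimented with looked roughly like the fragment below. This is a simplified sketch: the image tag and service names are assumptions, and `RAY_REDIS_ADDRESS` is the environment variable Ray documents for persisting GCS state in an external Redis.

```yaml
# docker-compose fragment (sketch of the now-removed external-Redis setup)
services:
  redis:
    image: redis:7
  ray-head:
    image: rayproject/ray:latest  # in reality, the project's custom Ray image
    environment:
      RAY_REDIS_ADDRESS: redis:6379  # point Ray's GCS at the external Redis
    depends_on:
      - redis
```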

## References
- Airflow Helm: https://airflow.apache.org/docs/helm-chart/stable/index.html
- Airflow Helm default values.yaml: https://github.com/apache/airflow/blob/main/chart/values.yaml
## References / Useful resources
- Ray sample config: https://github.com/ray-project/kuberay/tree/master/ray-operator/config/samples
- Bitnami Kafka Helm: https://github.com/bitnami/charts/tree/main/bitnami/kafka
- Airflow Helm: https://airflow.apache.org/docs/helm-chart/stable/index.html
- Airflow Helm default values.yaml: https://github.com/apache/airflow/blob/main/chart/values.yaml
- Dataset: https://www.kaggle.com/datasets/pratyushakar/rossmann-store-sales
- Original Airflow docker-compose file: https://airflow.apache.org/docs/apache-airflow/2.8.3/docker-compose.yaml

### Modifications made to Airflow docker-compose
- Removed Postgres (we connect to our existing instance with a new username and password)
- Added the env variable `SPARK_STREAM_CHECKPOINTS_PATH` and mounted a volume for this checkpoint path
- Connected to `forecast_network` defined in our existing docker-compose (see the sketch after this list)
- Note when starting: you need to specify both compose files, i.e. `docker-compose -f docker-compose.yml -f docker-compose-airflow.yml`
Based on the official doc: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html
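A minimal sketch of the `forecast_network` attachment mentioned in the list above (the Airflow image tag matches the linked official compose file; the service definition is shortened for illustration):

```yaml
# docker-compose-airflow.yml fragment (sketch only)
services:
  airflow-webserver:
    image: apache/airflow:2.8.3
    networks:
      - forecast_network

networks:
  forecast_network:
    external: true  # created by the main docker-compose.yml, so declared external here
```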
14 changes: 14 additions & 0 deletions sfmlops-helm/templates/forecast-service-deployment-service.yaml
@@ -12,6 +12,20 @@ spec:
labels:
component: {{ .Values.forecastServiceLabel }}
spec:
# this is a quick, just-works solution for configuring PVC permissions,
# but it introduces a security risk.
# In real production, please set up the user and group IDs properly in each image
# and grant only the permissions that are needed (see the securityContext sketch after this file excerpt)
initContainers:
- name: change-ownership-container
image: busybox
command: ["/bin/chmod", "-R", "777", {{ .Values.mlflowArtifactRoot }}] # Adjust mount path if needed
securityContext:
runAsUser: 0
privileged: true
volumeMounts:
- name: mlflow-data
mountPath: {{ .Values.mlflowArtifactRoot }}
volumes:
- name: mlflow-data
persistentVolumeClaim:
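As the comment in this template points out, a less permissive alternative to the chmod-777 initContainer is a pod-level `securityContext`, letting the kubelet set group ownership on the mounted volume. A minimal sketch follows; the UID/GID values, image, and claim name are placeholders and must match what is actually baked into your images and chart.

```yaml
# Sketch only -- replaces the chmod-777 initContainer; values are placeholders.
spec:
  securityContext:
    runAsUser: 1000    # non-root user the image runs as
    runAsGroup: 1000
    fsGroup: 1000      # kubelet chowns mounted volumes to this group
  containers:
    - name: forecast-service
      image: <your-registry>/forecast-service:latest
      volumeMounts:
        - name: mlflow-data
          mountPath: /storage/mlruns
  volumes:
    - name: mlflow-data
      persistentVolumeClaim:
        claimName: mlflow-pvc
```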
2 changes: 1 addition & 1 deletion sfmlops-helm/templates/global-configmap.yaml
@@ -21,7 +21,7 @@ data:
KAFKA_TOPIC: "sale_rossman_store"
SALES_TABLE_NAME: "rossman_sales"
FORECAST_TABLE_NAME: "forecast_results"
MLFLOW_ARTIFACT_ROOT: "/storage/mlruns/"
MLFLOW_ARTIFACT_ROOT: "/storage/mlruns/" # when working on-cloud, you must change this to object storage path like gs://<bucket>/<path>, or if you just wanna try out, create a cluster with only 1 node would do
AIRFLOW_UID: "501"
AIRFLOW_PROJ_DIR: "./services/airflow"
SPARK_STREAM_CHECKPOINTS_PATH: "/opt/airflow/spark_streaming_checkpoints"
3 changes: 1 addition & 2 deletions sfmlops-helm/templates/ingress.yaml
@@ -13,8 +13,7 @@ metadata:
spec:
ingressClassName: nginx
rules:
- host: localhost
http:
- http:
paths:
# capture group 1 will match with empty str
# so this = no rewrite

