opendatahub-io · jiridanek · Oct 23, 2024 · Oct 23, 2024 · Oct 23, 2024 · Oct 23, 2024
diff --git a/components/odh-notebook-controller/Makefile b/components/odh-notebook-controller/Makefile
@@ -100,6 +100,8 @@ test-with-rbac-true: manifests generate fmt vet envtest ## Run tests.
 	ACK_GINKGO_DEPRECATIONS=1.16.5 \
 	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" \
 		go test ./controllers/... -ginkgo.v -ginkgo.progress -test.v -coverprofile cover.out
+	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" \
+		go test ./upgrade/... -ginkgo.v -ginkgo.progress -test.v -coverprofile cover_upgrade.out
 
 ##@ Build
 

diff --git a/components/odh-notebook-controller/config/crd/external/route.openshift.io_routes.yaml b/components/odh-notebook-controller/config/crd/external/route.openshift.io_routes.yaml
diff --git a/components/odh-notebook-controller/go.mod b/components/odh-notebook-controller/go.mod
@@ -12,6 +12,7 @@ require (
 	github.com/onsi/gomega v1.30.0
 	github.com/openshift/api v0.0.0-20190924102528-32369d4db2ad
 	github.com/pkg/errors v0.9.1
+	github.com/prometheus/client_model v0.5.0
 	github.com/stretchr/testify v1.8.4
 	go.uber.org/zap v1.26.0
 	k8s.io/api v0.29.0
@@ -42,6 +43,7 @@ require (
 	github.com/imdario/mergo v0.3.12 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
+	github.com/kubeflow/kubeflow/components/common v0.0.0-20220218084159-4ad0158e955e // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
 	github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
@@ -50,7 +52,6 @@ require (
 	github.com/nxadm/tail v1.4.8 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/prometheus/client_golang v1.18.0 // indirect
-	github.com/prometheus/client_model v0.5.0 // indirect
 	github.com/prometheus/common v0.45.0 // indirect
 	github.com/prometheus/procfs v0.12.0 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
@@ -78,3 +79,5 @@ require (
 )
 
 replace google.golang.org/grpc => google.golang.org/grpc v1.56.3
+
+replace github.com/kubeflow/kubeflow/components/notebook-controller => ../notebook-controller
diff --git a/components/odh-notebook-controller/go.sum b/components/odh-notebook-controller/go.sum
@@ -75,8 +75,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/kubeflow/kubeflow/components/notebook-controller v0.0.0-20220728153354-fc09bd1eefb8 h1:IPD3fr1CTTtpT5Z7udPfQgawCyHsWK9vytFHaBLjIUI=
-github.com/kubeflow/kubeflow/components/notebook-controller v0.0.0-20220728153354-fc09bd1eefb8/go.mod h1:ROs2it0U7gdZQGoUNHG5XfhfBmkb5FV2uaJD2Sr8E+Y=
+github.com/kubeflow/kubeflow/components/common v0.0.0-20220218084159-4ad0158e955e h1:Ud6a+mkZ5pj+QqZnsIPPBk1WrABz1wYcd1n9CYjMMBA=
+github.com/kubeflow/kubeflow/components/common v0.0.0-20220218084159-4ad0158e955e/go.mod h1:uj1ImonZ+hHqWKfzZGWjuxl1uODjubBNrg4W8c38/ts=
 github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
 github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg=

diff --git a/components/odh-notebook-controller/upgrade/README.md b/components/odh-notebook-controller/upgrade/README.md
@@ -0,0 +1,90 @@
+## Motivation
+
+Thinking about not restarting user workbenches is hard and this test should provide an automated check for that.
+
+## Fidelity considerations
+
+Beside this, we need to also run [odh-e2e test](https://github.com/skodjob/odh-e2e) that use actual controllers built as containers installed from OLM bundle on an OpenShift cluster.
+That test is much more faithful to a real production deployment, although it's true we only run it with ephemeral short-lived single-node clusters.
+The advantage of the test here is that it can be run very quickly from the project checkout directory with the current sourcecode.
+It is enough to use `make envtest` and `go test` to run this, no building of images and no additional setup is required.
+Meaning that it's possible to iterate on this quickly and still have a fairly comprehensive and realistic upgrade test.
+
+# Test data for testing (simulated) operator upgrade
+
+Each directory holds CRs dumped from a particular Red Hat OpenShift AI deployment.
+We are using Red Hat OpenShift AI release versioning here to name the directories.
+
+| ODH version | RHOAI version | opendatahub-io/kubeflow release |
+|-------------|---------------|---------------------------------|
+|             | 2.13          |                                 |
+|             |               |                                 |
+
+The test deploys (current) CRDs, then deploys (old) CRs that simulate a lived-in product installation, then starts (current) controllers and asserts that nothing adverse happened.
+
+1. old CRs cannot be deployed with current CRDs
+   * that means we made an incompatible changes to CRDs and that change must be reverted
+2. current operator does not run correctly with old CRs
+   * incompatible change in the operator must be reverted and done again correctly
+3. current operator modifies old CRs in an impermissible way, such as if it modifies statefulset spec leading to notebook pod restart
+   * incompatible change in the operator must be reverted and done again correctly
+
+This way we will test upgrade from any old version to the current (in-development) version.
+This should mean that no supported upgrade path will surprise us, because we have tested a superset.
+If an upgrade from an older version is both unsupported and also actually not working correctly, only then we remove it from these tests.
+
+We will test even upgrades that are not supported according to the various support policies of vendors.
+If the upgrade can actually be performed correctly, that's fine; if not, we will remove the data directory, delete the test,
+and leave a note in the README why upgroade is not working.
+
+## More on envtest environment
+
+These tests run in kubernetes testenv, which is a minimal Kubernetes distribution of just the etcd and kube-apiserver.
+Notably it is without a kubelet and without other controllers, not even the [kube-controller-manager](https://kubernetes.io/docs/reference/command-line-tools-reference/kube-controller-manager) is present.
+OpenShift extensions are not present either, so we need to apply our own Route CRD.
+
+Since kubelet is missing, no actual containers are being started when things get deployed to the kube-apiserver.
+Furthermore, since kube-controller-manager is missing, there aren't even the standard controllers that create pods from deployments or statefulsets present.
+That means that any deployed statefulset will lay untouched and pod resources are not created from it.
+
+## Obtaining test data
+
+Use the commands given below to dump an existing namespace on a working, deployed cluster for the purpose of upgrade testing.
+Work in the Dashboard UI, we want to simulate a setup that an user might conceivable create.
+
+1. in DSCi, set certificates to Managed
+2. create DSC as the default one
+3. in dashboard, work as the `developer` user and create a `developer` namespace
+4. create a Data Connection, use the [AWS example credentials](https://docs.aws.amazon.com/STS/latest/APIReference/API_GetAccessKeyInfo.html)
+   * Id: `AKIAIOSFODNN7EXAMPLE`
+   * Key: `wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY`
+5. still as `developer`, spawn a Workbench
+
+## Dumping command
+
+Here's the commands that will dump what's in your `developer` namespace into yaml files for use in these tests.
+When adding a new RHOAI version, place the dump into `data/rhoai-version` and commit the files.
+
+```shell
+# dumps all resources in `developer` namespace to YAMLs
+kubectl api-resources --verbs=list --namespaced -o name > api-resources.txt
+cat api-resources.txt | \
+  grep -v '^events$' | grep -v '^events.events.k8s.io$' | grep -v '^packagemanifests.packages.operators.coreos.com$' | \
+  xargs -n 1 kubectl get -o=name -n developer > resource-names.txt
+cat resource-names.txt | while read line; do
+  mkdir -p "$(dirname ${line})"
+  kubectl get -o=yaml -n developer "${line}" > "${line}.yaml"
+done
+```
+
+## Audit logging
+
+In `data/configs` there is audit-policy for enabling audit logging from kube-apiserver.
+
+One useful `jq` command to filter out events by type from the audit log is
+
+```jq
+select(
+    .verb != "get" and .verb != "watch" and .verb != "list" and .verb != "create"
+)
+```