diff --git a/.github/workflows/build-images.yml b/.github/workflows/build-images.yml index 14071f54c49a9..439d02dfd0781 100644 --- a/.github/workflows/build-images.yml +++ b/.github/workflows/build-images.yml @@ -69,6 +69,7 @@ jobs: default-constraints-branch: ${{ steps.selective-checks.outputs.default-constraints-branch }} runs-on: ${{steps.selective-checks.outputs.runs-on}} is-self-hosted-runner: ${{ steps.selective-checks.outputs.is-self-hosted-runner }} + is-committer-build: ${{ steps.selective-checks.outputs.is-committer-build }} is-airflow-runner: ${{ steps.selective-checks.outputs.is-airflow-runner }} is-amd-runner: ${{ steps.selective-checks.outputs.is-amd-runner }} is-arm-runner: ${{ steps.selective-checks.outputs.is-arm-runner }} @@ -106,8 +107,6 @@ jobs: } }' --jq '.data.node.labels.nodes[]' | jq --slurp -c '[.[].name]' >> ${GITHUB_OUTPUT} if: github.event_name == 'pull_request_target' - # Retrieve it to be able to determine which files has changed in the incoming commit of the PR - # we checkout the target commit and its parent to be able to compare them - name: Cleanup repo run: docker run -v "${GITHUB_WORKSPACE}:/workspace" -u 0:0 bash -c "rm -rf /workspace/*" - uses: actions/checkout@v4 @@ -115,61 +114,32 @@ jobs: ref: ${{ env.TARGET_COMMIT_SHA }} persist-credentials: false fetch-depth: 2 - - name: "Setup python" - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - name: "Retrieve defaults from branch_defaults.py" - # We cannot "execute" the branch_defaults.py python code here because that would be - # a security problem (we cannot run any code that comes from the sources coming from the PR. - # Therefore, we extract the branches via embedded Python code - # we need to do it before next step replaces checked-out breeze and scripts code coming from - # the PR, because the PR defaults have to be retrieved here. - id: defaults - run: | - python - <> ${GITHUB_ENV} - from pathlib import Path - import re - import sys - - DEFAULTS_CONTENT = Path('dev/breeze/src/airflow_breeze/branch_defaults.py').read_text() - BRANCH_PATTERN = r'^AIRFLOW_BRANCH = "(.*)"$' - CONSTRAINTS_BRANCH_PATTERN = r'^DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH = "(.*)"$' - - branch = re.search(BRANCH_PATTERN, DEFAULTS_CONTENT, re.MULTILINE).group(1) - constraints_branch = re.search(CONSTRAINTS_BRANCH_PATTERN, DEFAULTS_CONTENT, re.MULTILINE).group(1) - - output = f""" - DEFAULT_BRANCH={branch} - DEFAULT_CONSTRAINTS_BRANCH={constraints_branch} - """.strip() - - print(output) - # Stdout is redirected to GITHUB_ENV but we also print it to stderr to see it in ci log - print(output, file=sys.stderr) - EOF - - name: Checkout target branch to 'target-airflow' folder to use ci/scripts and breeze from there. + #################################################################################################### + # WE ONLY DO THAT CHECKOUT ABOVE TO RETRIEVE THE TARGET COMMIT AND IT'S PARENT. DO NOT RUN ANY CODE + # RIGHT AFTER THAT AS WE ARE GOING TO RESTORE THE TARGET BRANCH CODE IN THE NEXT STEP. + #################################################################################################### + - name: Checkout target branch to use ci/scripts and breeze from there. uses: actions/checkout@v4 with: - path: "target-airflow" ref: ${{ github.base_ref }} persist-credentials: false - - name: > - Override "scripts/ci", "dev" and ".github/actions" with the target branch - so that the PR does not override it - # We should not override those scripts which become part of the image as they will not be - # changed in the image built - we should only override those that are executed to build - # the image. - shell: bash - run: | - rm -rfv "scripts/ci" - mv -v "target-airflow/scripts/ci" "scripts" - rm -rfv "dev" - mv -v "target-airflow/dev" "." - rm -rfv ".github/actions" - mv -v "target-airflow/.github/actions" ".github" + #################################################################################################### + # HERE EVERYTHING IS PERFECTLY SAFE TO RUN. AT THIS POINT WE HAVE THE TARGET BRANCH CHECKED OUT + # AND WE CAN RUN ANY CODE FROM IT. WE CAN RUN BREEZE COMMANDS, WE CAN RUN SCRIPTS, WE CAN RUN + # COMPOSITE ACTIONS. WE CAN RUN ANYTHING THAT IS IN THE TARGET BRANCH AND THERE IS NO RISK THAT + # CODE WILL BE RUN FROM THE PR. + #################################################################################################### + - name: "Setup python" + uses: actions/setup-python@v4 + with: + python-version: 3.8 - name: "Install Breeze" uses: ./.github/actions/breeze + #################################################################################################### + # WE RUN SELECTIVE CHECKS HERE USING THE TARGET COMMIT AND ITS PARENT TO BE ABLE TO COMPARE THEM + # AND SEE WHAT HAS CHANGED IN THE PR. THE CODE IS STILL RUN FROM THE TARGET BRANCH, SO IT IS SAFE + # TO RUN IT, WE ONLY PASS TARGET_COMMIT_SHA SO THAT SELECTIVE CHECKS CAN SEE WHAT'S COMING IN THE PR + #################################################################################################### - name: Selective checks id: selective-checks env: @@ -195,8 +165,7 @@ jobs: needs: [build-info] if: | needs.build-info.outputs.ci-image-build == 'true' && - github.event.pull_request.head.repo.full_name != 'apache/airflow' && - github.repository == 'apache/airflow' + github.event.pull_request.head.repo.full_name != 'apache/airflow' env: DEFAULT_BRANCH: ${{ needs.build-info.outputs.default-branch }} DEFAULT_CONSTRAINTS_BRANCH: ${{ needs.build-info.outputs.default-constraints-branch }} @@ -210,26 +179,44 @@ jobs: with: ref: ${{ needs.build-info.outputs.target-commit-sha }} persist-credentials: false + #################################################################################################### + # BE VERY CAREFUL HERE! THIS LINE AND THE END OF THE WARNING. HERE WE CHECK OUT THE TARGET + # COMMIT AND ITS PARENT TO BE ABLE TO COMPARE THEM BUT ALSO TO BE ABLE TO BUILD THE IMAGE FROM + # THE INCOMING PR, RATHER THAN FROM TARGET BRANCH. THIS IS A SECURITY RISK, BECAUSE THE PR + # CAN CONTAIN ANY CODE AND WE EXECUTE IT HERE. THEREFORE, WE NEED TO BE VERY CAREFUL WHAT WE + # DO HERE. WE SHOULD NOT EXECUTE ANY CODE THAT COMES FROM THE PR. WE SHOULD NOT RUN ANY BREEZE + # COMMAND NOR SCRIPTS NOR COMPOSITE ACTIONS. WE SHOULD ONLY RUN CODE THAT IS EMBEDDED DIRECTLY IN + # THIS WORKFLOW - BECAUSE THIS IS THE ONLY CODE THAT WE CAN TRUST. + #################################################################################################### - name: Checkout target branch to 'target-airflow' folder to use ci/scripts and breeze from there. uses: actions/checkout@v4 with: path: "target-airflow" ref: ${{ github.base_ref }} persist-credentials: false + if: needs.build-info.outputs.is-committer-build != 'true' - name: > - Override "scripts/ci", "dev" and ".github/actions" with the target branch - so that the PR does not override it - # We should not override those scripts which become part of the image as they will not be - # changed in the image built - we should only override those that are executed to build - # the image. + Replace "scripts/ci", "dev" and ".github/actions" with the target branch + so that the those directories are not coming from the PR shell: bash run: | + echo + echo -e "\033[33m Replace scripts, dev, actions with target branch for non-committer builds!\033[0m" + echo rm -rfv "scripts/ci" mv -v "target-airflow/scripts/ci" "scripts" rm -rfv "dev" mv -v "target-airflow/dev" "." rm -rfv ".github/actions" mv -v "target-airflow/.github/actions" ".github" + if: needs.build-info.outputs.is-committer-build != 'true' + #################################################################################################### + # HERE IT'S A BIT SAFER. THE `dev`, `scripts/ci` AND `.github/actions` ARE NOW COMING FROM THE + # BASE_REF - WHICH IS THE TARGET BRANCH OF THE PR. WE CAN TRUST THAT THOSE SCRIPTS ARE SAVE TO RUN. + # ALL THE REST OF THE CODE COMES FROM THE PR, AND FOR EXAMPLE THE CODE IN THE `Dockerfile.ci` CAN + # BE RUN SAFELY AS PART OF DOCKER BUILD. BECAUSE IT RUNS INSIDE THE DOCKER CONTAINER AND IT IS + # ISOLATED FROM THE RUNNER. + #################################################################################################### - name: > Build CI Images ${{needs.build-info.outputs.all-python-versions-list-as-string}}:${{env.IMAGE_TAG}} uses: ./.github/actions/build-ci-images @@ -252,8 +239,7 @@ jobs: needs: [build-info, build-ci-images] if: | needs.build-info.outputs.prod-image-build == 'true' && - github.event.pull_request.head.repo.full_name != 'apache/airflow' && - github.repository == 'apache/airflow' + github.event.pull_request.head.repo.full_name != 'apache/airflow' env: DEFAULT_BRANCH: ${{ needs.build-info.outputs.default-branch }} DEFAULT_CONSTRAINTS_BRANCH: ${{ needs.build-info.outputs.default-constraints-branch }} @@ -268,26 +254,44 @@ jobs: with: ref: ${{ needs.build-info.outputs.target-commit-sha }} persist-credentials: false + #################################################################################################### + # BE VERY CAREFUL HERE! THIS LINE AND THE END OF THE WARNING. HERE WE CHECK OUT THE TARGET + # COMMIT AND ITS PARENT TO BE ABLE TO COMPARE THEM BUT ALSO TO BE ABLE TO BUILD THE IMAGE FROM + # THE INCOMING PR, RATHER THAN FROM TARGET BRANCH. THIS IS A SECURITY RISK, BECAUSE THE PR + # CAN CONTAIN ANY CODE AND WE EXECUTE IT HERE. THEREFORE, WE NEED TO BE VERY CAREFUL WHAT WE + # DO HERE. WE SHOULD NOT EXECUTE ANY CODE THAT COMES FROM THE PR. WE SHOULD NOT RUN ANY BREEZE + # COMMAND NOR SCRIPTS NOR COMPOSITE ACTIONS. WE SHOULD ONLY RUN CODE THAT IS EMBEDDED DIRECTLY IN + # THIS WORKFLOW - BECAUSE THIS IS THE ONLY CODE THAT WE CAN TRUST. + #################################################################################################### - name: Checkout target branch to 'target-airflow' folder to use ci/scripts and breeze from there. uses: actions/checkout@v4 with: path: "target-airflow" ref: ${{ github.base_ref }} persist-credentials: false + if: needs.build-info.outputs.is-committer-build != 'true' - name: > - Override "scripts/ci", "dev" and ".github/actions" with the target branch - so that the PR does not override it - # We should not override those scripts which become part of the image as they will not be - # changed in the image built - we should only override those that are executed to build - # the image. + Replace "scripts/ci", "dev" and ".github/actions" with the target branch + so that the those directories are not coming from the PR shell: bash run: | + echo + echo -e "\033[33m Replace scripts, dev, actions with target branch for non-committer builds!\033[0m" + echo rm -rfv "scripts/ci" mv -v "target-airflow/scripts/ci" "scripts" rm -rfv "dev" mv -v "target-airflow/dev" "." rm -rfv ".github/actions" mv -v "target-airflow/.github/actions" ".github" + if: needs.build-info.outputs.is-committer-build != 'true' + #################################################################################################### + # HERE IT'S A BIT SAFER. THE `dev`, `scripts/ci` AND `.github/actions` ARE NOW COMING FROM THE + # BASE_REF - WHICH IS THE TARGET BRANCH OF THE PR. WE CAN TRUST THAT THOSE SCRIPTS ARE SAVE TO RUN. + # ALL THE REST OF THE CODE COMES FROM THE PR, AND FOR EXAMPLE THE CODE IN THE `Dockerfile.ci` CAN + # BE RUN SAFELY AS PART OF DOCKER BUILD. BECAUSE IT RUNS INSIDE THE DOCKER CONTAINER AND IT IS + # ISOLATED FROM THE RUNNER. + #################################################################################################### - name: "Install Breeze" uses: ./.github/actions/breeze - name: > @@ -314,8 +318,7 @@ jobs: needs.build-info.outputs.upgrade-to-newer-dependencies != 'false' && github.event.pull_request.head.repo.full_name != 'apache/airflow' && needs.build-info.outputs.is-self-hosted-runner == 'true' && - needs.build-info.outputs.is-airflow-runner == 'true' && - github.repository == 'apache/airflow' + needs.build-info.outputs.is-airflow-runner == 'true' env: DEFAULT_BRANCH: ${{ needs.build-info.outputs.default-branch }} DEFAULT_CONSTRAINTS_BRANCH: ${{ needs.build-info.outputs.default-constraints-branch }} @@ -330,26 +333,44 @@ jobs: with: ref: ${{ needs.build-info.outputs.target-commit-sha }} persist-credentials: false + #################################################################################################### + # BE VERY CAREFUL HERE! THIS LINE AND THE END OF THE WARNING. HERE WE CHECK OUT THE TARGET + # COMMIT AND ITS PARENT TO BE ABLE TO COMPARE THEM BUT ALSO TO BE ABLE TO BUILD THE IMAGE FROM + # THE INCOMING PR, RATHER THAN FROM TARGET BRANCH. THIS IS A SECURITY RISK, BECAUSE THE PR + # CAN CONTAIN ANY CODE AND WE EXECUTE IT HERE. THEREFORE, WE NEED TO BE VERY CAREFUL WHAT WE + # DO HERE. WE SHOULD NOT EXECUTE ANY CODE THAT COMES FROM THE PR. WE SHOULD NOT RUN ANY BREEZE + # COMMAND NOR SCRIPTS NOR COMPOSITE ACTIONS. WE SHOULD ONLY RUN CODE THAT IS EMBEDDED DIRECTLY IN + # THIS WORKFLOW - BECAUSE THIS IS THE ONLY CODE THAT WE CAN TRUST. + #################################################################################################### - name: Checkout target branch to 'target-airflow' folder to use ci/scripts and breeze from there. uses: actions/checkout@v4 with: path: "target-airflow" ref: ${{ github.base_ref }} persist-credentials: false + if: needs.build-info.outputs.is-committer-build != 'true' - name: > - Override "scripts/ci", "dev" and ".github/actions" with the target branch - so that the PR does not override it - # We should not override those scripts which become part of the image as they will not be - # changed in the image built - we should only override those that are executed to build - # the image. + Replace "scripts/ci", "dev" and ".github/actions" with the target branch + so that the those directories are not coming from the PR shell: bash run: | + echo + echo -e "\033[33m Replace scripts, dev, actions with target branch for non-committer builds!\033[0m" + echo rm -rfv "scripts/ci" mv -v "target-airflow/scripts/ci" "scripts" rm -rfv "dev" mv -v "target-airflow/dev" "." rm -rfv ".github/actions" mv -v "target-airflow/.github/actions" ".github" + if: needs.build-info.outputs.is-committer-build != 'true' + #################################################################################################### + # HERE IT'S A BIT SAFER. THE `dev`, `scripts/ci` AND `.github/actions` ARE NOW COMING FROM THE + # BASE_REF - WHICH IS THE TARGET BRANCH OF THE PR. WE CAN TRUST THAT THOSE SCRIPTS ARE SAVE TO RUN. + # ALL THE REST OF THE CODE COMES FROM THE PR, AND FOR EXAMPLE THE CODE IN THE `Dockerfile.ci` CAN + # BE RUN SAFELY AS PART OF DOCKER BUILD. BECAUSE IT RUNS INSIDE THE DOCKER CONTAINER AND IT IS + # ISOLATED FROM THE RUNNER. + #################################################################################################### - name: "Start ARM instance" run: ./scripts/ci/images/ci_start_arm_instance_and_connect_to_docker.sh - name: "Install Breeze" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 38abcfc9d47ff..5795602cbef59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -146,34 +146,6 @@ jobs: persist-credentials: false - name: "Install Breeze" uses: ./.github/actions/breeze - - name: "Retrieve defaults from branch_defaults.py" - id: defaults - # We could retrieve it differently here - by just importing the variables and - # printing them from python code, however we want to have the same code as used in - # the build-images.yml (there we cannot import python code coming from the PR - we need to - # treat the python code as text and extract the variables from there. - run: | - python - <> ${GITHUB_ENV} - from pathlib import Path - import re - import sys - - DEFAULTS_CONTENT = Path('dev/breeze/src/airflow_breeze/branch_defaults.py').read_text() - BRANCH_PATTERN = r'^AIRFLOW_BRANCH = "(.*)"$' - CONSTRAINTS_BRANCH_PATTERN = r'^DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH = "(.*)"$' - - branch = re.search(BRANCH_PATTERN, DEFAULTS_CONTENT, re.MULTILINE).group(1) - constraints_branch = re.search(CONSTRAINTS_BRANCH_PATTERN, DEFAULTS_CONTENT, re.MULTILINE).group(1) - - output = f""" - DEFAULT_BRANCH={branch} - DEFAULT_CONSTRAINTS_BRANCH={constraints_branch} - """.strip() - - print(output) - # Stdout is redirected to GITHUB_ENV but we also print it to stderr to see it in ci log - print(output, file=sys.stderr) - EOF - name: "Get information about the Workflow" id: source-run-info run: breeze ci get-workflow-info 2>> ${GITHUB_OUTPUT} diff --git a/dev/breeze/doc/08_ci_tasks.rst b/dev/breeze/doc/08_ci_tasks.rst index d6594cf8d9c36..13b3cb2666abf 100644 --- a/dev/breeze/doc/08_ci_tasks.rst +++ b/dev/breeze/doc/08_ci_tasks.rst @@ -91,7 +91,7 @@ The selective-check command will produce the set of ``name=value`` pairs of outp from the context of the commit/PR to be merged via stderr output. More details about the algorithm used to pick the right tests and the available outputs can be -found in `Selective Checks `_. +found in `Selective Checks `_. These are all available flags of ``selective-check`` command: diff --git a/dev/breeze/doc/ci/04_static_checks.md b/dev/breeze/doc/ci/04_selective_checks.md similarity index 63% rename from dev/breeze/doc/ci/04_static_checks.md rename to dev/breeze/doc/ci/04_selective_checks.md index 9162012aff113..e4e1d8b3fc60a 100644 --- a/dev/breeze/doc/ci/04_static_checks.md +++ b/dev/breeze/doc/ci/04_selective_checks.md @@ -27,6 +27,8 @@ - [Skipping pre-commits (Static checks)](#skipping-pre-commits-static-checks) - [Suspended providers](#suspended-providers) - [Selective check outputs](#selective-check-outputs) + - [Committer vs. non-committer PRs](#committer-vs-non-committer-prs) + - [PR labels](#pr-labels) @@ -147,8 +149,6 @@ when some files are not changed. Those are the rules implemented: * if no `All Providers Python files` and no `All Providers Yaml files` are changed - `check-provider-yaml-valid` check is skipped - - ## Suspended providers The selective checks will fail in PR if it contains changes to a suspended provider unless you set the @@ -164,51 +164,65 @@ separated by spaces. This is to accommodate for the wau how outputs of this kind Github Actions to pass the list of parameters to a command to execute -| Output | Meaning of the output | Example value | List as string | -|------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------------------------------------------|----------------| -| affected-providers-list-as-string | List of providers affected when they are selectively affected. | airbyte http | * | -| all-python-versions | List of all python versions there are available in the form of JSON array | ['3.8', '3.9', '3.10'] | | -| all-python-versions-list-as-string | List of all python versions there are available in the form of space separated string | 3.8 3.9 3.10 | * | -| basic-checks-only | Whether to run all static checks ("false") or only basic set of static checks ("true") | false | | -| cache-directive | Which cache should be used for images ("registry", "local" , "disabled") | registry | | -| debug-resources | Whether resources usage should be printed during parallel job execution ("true"/ "false") | false | | -| default-branch | Which branch is default for the build ("main" for main branch, "v2-4-test" for 2.4 line etc.) | main | | -| default-constraints-branch | Which branch is default for the build ("constraints-main" for main branch, "constraints-2-4" etc.) | constraints-main | | -| default-helm-version | Which Helm version to use as default | v3.9.4 | | -| default-kind-version | Which Kind version to use as default | v0.16.0 | | -| default-kubernetes-version | Which Kubernetes version to use as default | v1.25.2 | | -| default-mysql-version | Which MySQL version to use as default | 5.7 | | -| default-postgres-version | Which Postgres version to use as default | 10 | | -| default-python-version | Which Python version to use as default | 3.8 | | -| docs-build | Whether to build documentation ("true"/"false") | true | | -| docs-list-as-string | What filter to apply to docs building - based on which documentation packages should be built | apache-airflow helm-chart google | | -| full-tests-needed | Whether this build runs complete set of tests or only subset (for faster PR builds) | false | | -| helm-version | Which Helm version to use for tests | v3.9.4 | | -| ci-image-build | Whether CI image build is needed | true | | -| prod-image-build | Whether PROD image build is needed | true | | -| kind-version | Which Kind version to use for tests | v0.16.0 | | -| kubernetes-combos-list-as-string | All combinations of Python version and Kubernetes version to use for tests as space-separated string | 3.8-v1.25.2 3.9-v1.26.4 | * | -| kubernetes-versions | All Kubernetes versions to use for tests as JSON array | ['v1.25.2'] | | -| kubernetes-versions-list-as-string | All Kubernetes versions to use for tests as space-separated string | v1.25.2 | * | -| mysql-exclude | Which versions of MySQL to exclude for tests as JSON array | [] | | -| mysql-versions | Which versions of MySQL to use for tests as JSON array | ['5.7'] | | -| needs-api-codegen | Whether "api-codegen" are needed to run ("true"/"false") | true | | -| needs-api-tests | Whether "api-tests" are needed to run ("true"/"false") | true | | -| needs-helm-tests | Whether Helm tests are needed to run ("true"/"false") | true | | -| needs-javascript-scans | Whether javascript CodeQL scans should be run ("true"/"false") | true | | -| needs-python-scans | Whether Python CodeQL scans should be run ("true"/"false") | true | | -| parallel-test-types-list-as-string | Which test types should be run for unit tests | API Always Providers\[amazon\] Providers\[-amazon\] | * | -| postgres-exclude | Which versions of Postgres to exclude for tests as JSON array | [] | | -| postgres-versions | Which versions of Postgres to use for tests as JSON array | ['10'] | | -| python-versions | Which versions of Python to use for tests as JSON array | ['3.8'] | | -| python-versions-list-as-string | Which versions of MySQL to use for tests as space-separated string | 3.8 | * | -| run-kubernetes-tests | Whether Kubernetes tests should be run ("true"/"false") | true | | -| run-tests | Whether unit tests should be run ("true"/"false") | true | | -| run-www-tests | Whether WWW tests should be run ("true"/"false") | true | | -| skip-pre-commits | Which pre-commits should be skipped during the static-checks run | true | | -| skip-provider-tests | When provider tests should be skipped (on non-main branch or when no provider changes detected) | true | | -| sqlite-exclude | Which versions of Sqlite to exclude for tests as JSON array | [] | | -| upgrade-to-newer-dependencies | Whether the image build should attempt to upgrade all dependencies (might be true/false or commit hash) | false | | +| Output | Meaning of the output | Example value | List as string | +|------------------------------------|------------------------------------------------------------------------------------------------------|--------------------------------------------|----------------| +| affected-providers-list-as-string | List of providers affected when they are selectively affected. | airbyte http | * | +| all-python-versions | List of all python versions there are available in the form of JSON array | ['3.8', '3.9', '3.10'] | | +| all-python-versions-list-as-string | List of all python versions there are available in the form of space separated string | 3.8 3.9 3.10 | * | +| basic-checks-only | Whether to run all static checks ("false") or only basic set of static checks ("true") | false | | +| cache-directive | Which cache should be used for images ("registry", "local" , "disabled") | registry | | +| chicken-egg-providers | List of providers that should be considered as "chicken-egg" - expecting development Airflow version | | | +| ci-image-build | Whether CI image build is needed | true | | +| debug-resources | Whether resources usage should be printed during parallel job execution ("true"/ "false") | false | | +| default-branch | Which branch is default for the build ("main" for main branch, "v2-4-test" for 2.4 line etc.) | main | | +| default-constraints-branch | Which branch is default for the build ("constraints-main" for main branch, "constraints-2-4" etc.) | constraints-main | | +| default-helm-version | Which Helm version to use as default | v3.9.4 | | +| default-kind-version | Which Kind version to use as default | v0.16.0 | | +| default-kubernetes-version | Which Kubernetes version to use as default | v1.25.2 | | +| default-mysql-version | Which MySQL version to use as default | 5.7 | | +| default-postgres-version | Which Postgres version to use as default | 10 | | +| default-python-version | Which Python version to use as default | 3.8 | | +| docs-build | Whether to build documentation ("true"/"false") | true | | +| docs-list-as-string | What filter to apply to docs building - based on which documentation packages should be built | apache-airflow helm-chart google | | +| full-tests-needed | Whether this build runs complete set of tests or only subset (for faster PR builds) [1] | false | | +| helm-version | Which Helm version to use for tests | v3.9.4 | | +| is-airflow-runner | Whether runner used is an airflow or infrastructure runner (true if airflow/false if infrastructure) | false | | +| is-amd-runner | Whether runner used is an AMD one | true | | +| is-arm-runner | Whether runner used is an ARM one | false | | +| is-committer-build | Whether the build is triggered by a committer | false | | +| is-k8s-runner | Whether the build runs on our k8s infrastructure | false | | +| is-self-hosted-runner | Whether the runner is self-hosted | false | | +| is-vm-runner | Whether the runner uses VM to run | true | | +| kind-version | Which Kind version to use for tests | v0.16.0 | | +| kubernetes-combos-list-as-string | All combinations of Python version and Kubernetes version to use for tests as space-separated string | 3.8-v1.25.2 3.9-v1.26.4 | * | +| kubernetes-versions | All Kubernetes versions to use for tests as JSON array | ['v1.25.2'] | | +| kubernetes-versions-list-as-string | All Kubernetes versions to use for tests as space-separated string | v1.25.2 | * | +| mypy-folders | List of folders to be considered for mypy | [] | | +| mysql-exclude | Which versions of MySQL to exclude for tests as JSON array | [] | | +| mysql-versions | Which versions of MySQL to use for tests as JSON array | ['5.7'] | | +| needs-api-codegen | Whether "api-codegen" are needed to run ("true"/"false") | true | | +| needs-api-tests | Whether "api-tests" are needed to run ("true"/"false") | true | | +| needs-helm-tests | Whether Helm tests are needed to run ("true"/"false") | true | | +| needs-javascript-scans | Whether javascript CodeQL scans should be run ("true"/"false") | true | | +| needs-mypy | Whether mypy check is supposed to run in this build | true | | +| needs-python-scans | Whether Python CodeQL scans should be run ("true"/"false") | true | | +| parallel-test-types-list-as-string | Which test types should be run for unit tests | API Always Providers Providers\[-google\] | * | +| postgres-exclude | Which versions of Postgres to exclude for tests as JSON array | [] | | +| postgres-versions | Which versions of Postgres to use for tests as JSON array | ['10'] | | +| prod-image-build | Whether PROD image build is needed | true | | +| prod-image-build | Whether PROD image build is needed | true | | +| providers-compatibility-checks | List of dicts: (python_version, airflow_version, removed_providers) for compatibility checks | [] | | +| python-versions | List of python versions to use for that build | ['3.8'] | * | +| python-versions-list-as-string | Which versions of MySQL to use for tests as space-separated string | 3.8 | * | +| run-amazon-tests | Whether Amazon tests should be run ("true"/"false") | true | | +| run-kubernetes-tests | Whether Kubernetes tests should be run ("true"/"false") | true | | +| run-tests | Whether unit tests should be run ("true"/"false") | true | | +| run-www-tests | Whether WWW tests should be run ("true"/"false") | true | | +| runs-on | List of labels assigned for runners for that build (used to select runners) | ["ubuntu-22.04"] | | +| skip-pre-commits | Which pre-commits should be skipped during the static-checks run | check-provider-yaml-valid,flynt,identity | | +| skip-provider-tests | When provider tests should be skipped (on non-main branch or when no provider changes detected) | true | | +| sqlite-exclude | Which versions of Sqlite to exclude for tests as JSON array | [] | | +| upgrade-to-newer-dependencies | Whether the image build should attempt to upgrade all dependencies (true/false or commit hash) | false | | [1] Note for deciding if `full tests needed` mode is enabled and provider.yaml files. @@ -225,7 +239,45 @@ for a lot of information for each provider and sometimes (for example when we up or when new Hook class is added), we do not need to run full tests. That's why we do not base our `full tests needed` decision on changes in dependency files that are generated -from the `provider.yaml` files. +from the `provider.yaml` files, but on `generated/provider_dependencies.json` and `pyproject.toml` files being +modified. This can be overridden by setting `full tests needed` label in the PR. + +## Committer vs. non-committer PRs + +There is a difference in how the CI jobs are run for committer and non-committer PRs from forks. +Main reason is security - we do not want to run untrusted code on our infrastructure for self-hosted runners, +but also we do not want to run unverified code during the `Build imaage` workflow, because that workflow has +access to GITHUB_TOKEN that has access to write to the Github Registry of ours (which is used to cache +images between runs). Also those images are build on self-hosted runners and we have to make sure that +those runners are not used to (fore example) mine cryptocurrencies on behalf of the person who opened the +pull request from their newly opened fork of airflow. + +This is why the `Build Images` workflow checks if the actor of the PR (GITHUB_ACTOR) is one of the committers, +and if not, then workflows and scripts used to run image building are coming only from the ``target`` branch +of the repository, where such scripts were reviewed and approved by the committers before being merged. + +This is controlled by `Selective checks <04_selective_checks.md>`__ that set appropriate output in +the build-info job of the workflow (see`is-committer-build` to `true`) if the actor is in the committer's +list and can be overridden by `non committer build` label in the PR. + +Also, for most of the jobs, committer builds by default use "Self-hosted" runners, while non-committer +builds use "Public" runners. For committers, this can be overridden by setting the +`use public runners` label in the PR. + + +## PR labels + +As mentioned below, you can influence the outputs of selected checks by setting labels to the PR. Here is +am overview of possible labels and their meaning: + +| Label | Affected outputs | Meaning | +|-------------------------------|-------------------------------|-------------------------------------------------------------------------------------------------------| +| full tests needed | full-tests-needed | Run complete set of tests, including all Python all DB versions, and all test types. | +| debug ci resources | debug-ci-resources | If set, then debugging resources is enabled during parallel tests and you can see them in the output. | +| use public runners | runs-on | Force using public runners even for Committer runs. | +| non committer build | is-committer-build | If set then even for non-committer builds, the scripts used for images are used from target branch. | +| upgrade to newer dependencies | upgrade-to-newer-dependencies | If set then dependencies in the CI image build are upgraded to the newer ones. | + ----- diff --git a/dev/breeze/doc/ci/05_workflows.md b/dev/breeze/doc/ci/05_workflows.md index ab914f2329ae2..8310b9ce30532 100644 --- a/dev/breeze/doc/ci/05_workflows.md +++ b/dev/breeze/doc/ci/05_workflows.md @@ -28,6 +28,7 @@ - [Workflows](#workflows) - [Build Images Workflow](#build-images-workflow) - [Differences for main and release branches](#differences-for-main-and-release-branches) + - [Committer vs. non-committer PRs](#committer-vs-non-committer-prs) - [Tests Workflow](#tests-workflow) - [CodeQL scan](#codeql-scan) - [Publishing documentation](#publishing-documentation) @@ -84,6 +85,13 @@ CI/CD scripts or changes to the CI/CD workflows). In this case the PR is run in the context of the "apache/airflow" repository and has WRITE access to the GitHub Container Registry. +When the PR changes important files (for example `generated/provider_depdencies.json` or +`pyproject.toml`), the PR is run in "upgrade to newer dependencies" mode - where instead +of using constraints to build images, attempt is made to upgrade all dependencies to latest +versions and build images with them. This way we check how Airflow behaves when the +dependencies are upgraded. This can also be forced by setting the `upgrade to newer dependencies` +label in the PR if you are a committer and want to force dependency upgrade. + ## Canary run This workflow is triggered when a pull request is merged into the "main" @@ -184,6 +192,28 @@ the new branch and there are a few places where selection of tests is based on whether this output is `main`. They are marked as - in the "Release branches" column of the table below. +## Committer vs. non-committer PRs + +There is a difference in how the CI jobs are run for committer and non-committer PRs from forks. +Main reason is security - we do not want to run untrusted code on our infrastructure for self-hosted runners, +but also we do not want to run unverified code during the `Build imaage` workflow, because that workflow has +access to GITHUB_TOKEN that has access to write to the Github Registry of ours (which is used to cache +images between runs). Also those images are build on self-hosted runners and we have to make sure that +those runners are not used to (fore example) mine cryptocurrencies on behalf of the person who opened the +pull request from their newly opened fork of airflow. + +This is why the `Build Images` workflow checks if the actor of the PR (GITHUB_ACTOR) is one of the committers, +and if not, then workflows and scripts used to run image building are coming only from the ``target`` branch +of the repository, where such scripts were reviewed and approved by the committers before being merged. + +This is controlled by `Selective checks <04_selective_checks.md>`__ that set appropriate output in +the build-info job of the workflow (see`is-committer-build` to `true`) if the actor is in the committer's +list and can be overridden by `non committer build` label in the PR. + +Also, for most of the jobs, committer builds by default use "Self-hosted" runners, while non-committer +builds use "Public" runners. For committers, this can be overridden by setting the +`use public runners` label in the PR. + ## Tests Workflow This workflow is a regular workflow that performs all checks of Airflow diff --git a/dev/breeze/src/airflow_breeze/commands/ci_commands.py b/dev/breeze/src/airflow_breeze/commands/ci_commands.py index 19e23be5f7419..3122cd5c1ee06 100644 --- a/dev/breeze/src/airflow_breeze/commands/ci_commands.py +++ b/dev/breeze/src/airflow_breeze/commands/ci_commands.py @@ -30,6 +30,7 @@ import click +from airflow_breeze.branch_defaults import AIRFLOW_BRANCH, DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH from airflow_breeze.commands.common_options import ( option_answer, option_dry_run, @@ -197,14 +198,14 @@ def get_changed_files(commit_ref: str | None) -> tuple[str, ...]: @click.option( "--default-branch", help="Branch against which the PR should be run", - default="main", + default=AIRFLOW_BRANCH, envvar="DEFAULT_BRANCH", show_default=True, ) @click.option( "--default-constraints-branch", help="Constraints Branch against which the PR should be run", - default="constraints-main", + default=DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH, envvar="DEFAULT_CONSTRAINTS_BRANCH", show_default=True, ) diff --git a/dev/breeze/src/airflow_breeze/params/common_build_params.py b/dev/breeze/src/airflow_breeze/params/common_build_params.py index 421bcc7c3a947..864b156691279 100644 --- a/dev/breeze/src/airflow_breeze/params/common_build_params.py +++ b/dev/breeze/src/airflow_breeze/params/common_build_params.py @@ -46,10 +46,8 @@ class CommonBuildParams: additional_dev_apt_env: str | None = None additional_python_deps: str | None = None additional_pip_install_flags: str | None = None - airflow_branch: str = os.environ.get("DEFAULT_BRANCH", AIRFLOW_BRANCH) - default_constraints_branch: str = os.environ.get( - "DEFAULT_CONSTRAINTS_BRANCH", DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH - ) + airflow_branch: str = AIRFLOW_BRANCH + default_constraints_branch: str = DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH airflow_constraints_location: str | None = None builder: str = "autodetect" build_progress: str = ALLOWED_BUILD_PROGRESS[0] diff --git a/dev/breeze/src/airflow_breeze/params/shell_params.py b/dev/breeze/src/airflow_breeze/params/shell_params.py index 24203bdbff5c8..b0666ebf4d0d1 100644 --- a/dev/breeze/src/airflow_breeze/params/shell_params.py +++ b/dev/breeze/src/airflow_breeze/params/shell_params.py @@ -115,7 +115,7 @@ class ShellParams: Shell parameters. Those parameters are used to determine command issued to run shell command. """ - airflow_branch: str = os.environ.get("DEFAULT_BRANCH", AIRFLOW_BRANCH) + airflow_branch: str = AIRFLOW_BRANCH airflow_constraints_location: str = "" airflow_constraints_mode: str = ALLOWED_CONSTRAINTS_MODES_CI[0] airflow_constraints_reference: str = "" @@ -130,9 +130,7 @@ class ShellParams: collect_only: bool = False database_isolation: bool = False db_reset: bool = False - default_constraints_branch: str = os.environ.get( - "DEFAULT_CONSTRAINTS_BRANCH", DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH - ) + default_constraints_branch: str = DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH dev_mode: bool = False docker_host: str | None = os.environ.get("DOCKER_HOST") downgrade_sqlalchemy: bool = False diff --git a/dev/breeze/src/airflow_breeze/utils/selective_checks.py b/dev/breeze/src/airflow_breeze/utils/selective_checks.py index 93a356d29c46f..1cdeb88536023 100644 --- a/dev/breeze/src/airflow_breeze/utils/selective_checks.py +++ b/dev/breeze/src/airflow_breeze/utils/selective_checks.py @@ -24,6 +24,7 @@ from functools import cached_property, lru_cache from typing import Any, Dict, List, TypeVar +from airflow_breeze.branch_defaults import AIRFLOW_BRANCH, DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH from airflow_breeze.global_constants import ( ALL_PYTHON_MAJOR_MINOR_VERSIONS, APACHE_AIRFLOW_GITHUB_REPOSITORY, @@ -62,6 +63,7 @@ FULL_TESTS_NEEDED_LABEL = "full tests needed" DEBUG_CI_RESOURCES_LABEL = "debug ci resources" USE_PUBLIC_RUNNERS_LABEL = "use public runners" +NON_COMMITTER_BUILD_LABEL = "non committer build" UPGRADE_TO_NEWER_DEPENDENCIES_LABEL = "upgrade to newer dependencies" ALL_CI_SELECTIVE_TEST_TYPES = ( @@ -350,8 +352,8 @@ class SelectiveChecks: def __init__( self, files: tuple[str, ...] = (), - default_branch="main", - default_constraints_branch="constraints-main", + default_branch=AIRFLOW_BRANCH, + default_constraints_branch=DEFAULT_AIRFLOW_CONSTRAINTS_BRANCH, commit_ref: str | None = None, pr_labels: tuple[str, ...] = (), github_event: GithubEvents = GithubEvents.PULL_REQUEST, @@ -1039,3 +1041,9 @@ def providers_compatibility_checks(self) -> str: if check["python-version"] in self.python_versions ] ) + + @cached_property + def is_committer_build(self): + if NON_COMMITTER_BUILD_LABEL in self._pr_labels: + return False + return self._github_actor in COMMITTERS diff --git a/dev/breeze/tests/test_selective_checks.py b/dev/breeze/tests/test_selective_checks.py index ff9d8f9472707..f3b48c622407a 100644 --- a/dev/breeze/tests/test_selective_checks.py +++ b/dev/breeze/tests/test_selective_checks.py @@ -1719,3 +1719,52 @@ def test_mypy_matches( pr_labels=pr_labels, ) assert_outputs_are_printed(expected_outputs, str(stderr)) + + +@pytest.mark.parametrize( + "files, expected_outputs, github_actor, pr_labels", + [ + pytest.param( + ("README.md",), + { + "is-committer-build": "false", + "runs-on": '["ubuntu-22.04"]', + }, + "", + (), + id="Regular pr", + ), + pytest.param( + ("README.md",), + { + "is-committer-build": "true", + "runs-on": '["self-hosted", "Linux", "X64"]', + }, + "potiuk", + (), + id="Committer regular PR", + ), + pytest.param( + ("README.md",), + { + "is-committer-build": "false", + "runs-on": '["self-hosted", "Linux", "X64"]', + }, + "potiuk", + ("non committer build",), + id="Committer regular PR - forcing non-committer build", + ), + ], +) +def test_pr_labels( + files: tuple[str, ...], expected_outputs: dict[str, str], github_actor: str, pr_labels: tuple[str, ...] +): + stderr = SelectiveChecks( + files=files, + commit_ref="HEAD", + default_branch="main", + github_actor=github_actor, + github_event=GithubEvents.PULL_REQUEST, + pr_labels=pr_labels, + ) + assert_outputs_are_printed(expected_outputs, str(stderr)) diff --git a/scripts/in_container/install_airflow_and_providers.py b/scripts/in_container/install_airflow_and_providers.py index d0b79b417b6ef..7c7ee6063d8c4 100755 --- a/scripts/in_container/install_airflow_and_providers.py +++ b/scripts/in_container/install_airflow_and_providers.py @@ -335,8 +335,7 @@ def find_installation_spec( ) @click.option( "--default-constraints-branch", - default="constraints-main", - show_default=True, + required=True, envvar="DEFAULT_CONSTRAINTS_BRANCH", help="Default constraints branch to use", ) diff --git a/scripts/in_container/run_generate_constraints.py b/scripts/in_container/run_generate_constraints.py index 99080fa3eb02d..9a77b178144ad 100755 --- a/scripts/in_container/run_generate_constraints.py +++ b/scripts/in_container/run_generate_constraints.py @@ -30,10 +30,11 @@ from click import Choice from in_container_utils import click, console, run_command -AIRFLOW_SOURCES = Path(__file__).resolve().parents[2] +AIRFLOW_SOURCE_DIR = Path(__file__).resolve().parents[2] + DEFAULT_BRANCH = os.environ.get("DEFAULT_BRANCH", "main") PYTHON_VERSION = os.environ.get("PYTHON_MAJOR_MINOR_VERSION", "3.8") -GENERATED_PROVIDER_DEPENDENCIES_FILE = AIRFLOW_SOURCES / "generated" / "provider_dependencies.json" +GENERATED_PROVIDER_DEPENDENCIES_FILE = AIRFLOW_SOURCE_DIR / "generated" / "provider_dependencies.json" ALL_PROVIDER_DEPENDENCIES = json.loads(GENERATED_PROVIDER_DEPENDENCIES_FILE.read_text()) @@ -145,7 +146,7 @@ def install_local_airflow_with_eager_upgrade( "eager", ], github_actions=config_params.github_actions, - cwd=AIRFLOW_SOURCES, + cwd=AIRFLOW_SOURCE_DIR, check=True, ) @@ -240,7 +241,7 @@ def uninstall_all_packages(config_params: ConfigParams): result = run_command( ["pip", "freeze"], github_actions=config_params.github_actions, - cwd=AIRFLOW_SOURCES, + cwd=AIRFLOW_SOURCE_DIR, text=True, check=True, capture_output=True, @@ -253,7 +254,7 @@ def uninstall_all_packages(config_params: ConfigParams): run_command( ["pip", "uninstall", "--root-user-action", "ignore", "-y", *all_installed_packages], github_actions=config_params.github_actions, - cwd=AIRFLOW_SOURCES, + cwd=AIRFLOW_SOURCE_DIR, text=True, check=True, ) @@ -396,8 +397,7 @@ def generate_constraints_no_providers(config_params: ConfigParams) -> None: ) @click.option( "--default-constraints-branch", - default="constraints-main", - show_default=True, + required=True, envvar="DEFAULT_CONSTRAINTS_BRANCH", help="Branch to get constraints from", )