-
Notifications
You must be signed in to change notification settings - Fork 48
159 lines (132 loc) · 5.96 KB
/
e2e_tests.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
name: e2e
on:
pull_request:
branches:
- main
- 'release-*'
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
push:
branches:
- main
- 'release-*'
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true
jobs:
kubernetes-e2e:
runs-on: ubuntu-20.04-4core-gpu
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
submodules: recursive
- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: 'project-codeflare/codeflare-common'
ref: 'main'
path: 'common'
- name: Set Go
uses: actions/setup-go@v5
with:
go-version-file: './go.mod'
- name: Set up gotestfmt
uses: gotesttools/gotestfmt-action@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup
- name: Setup and start KinD cluster
id: kind-install
uses: ./common/github-actions/kind
- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator
- name: Deploy CodeFlare stack
id: deploy
run: |
echo Setting up CodeFlare stack
make setup-e2e
echo Deploying CodeFlare operator
IMG=localhost/codeflare-operator:test
make image-build -e IMG="${IMG}"
podman save -o cfo.tar ${IMG}
kind load image-archive cfo.tar --name cluster --verbosity 1000
make deploy -e IMG="${IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
- name: Add user to KinD
uses: ./common/github-actions/kind-add-user
with:
user-name: test-user
- name: Configure RBAC for test user to use namespaced admin role
run: |
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
kubectl create clusterrolebinding test-user-namespace-creator --clusterrole=namespace-creator --user=test-user
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
kubectl create clusterrolebinding test-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=test-user
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
kubectl create clusterrolebinding test-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=test-user
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
kubectl create clusterrolebinding test-user-localqueue-creator --clusterrole=localqueue-creator --user=test-user
kubectl create clusterrole event-creator --verb=get,list,create,delete,patch --resource=events.k8s.io
kubectl create clusterrolebinding test-user-event-creator --clusterrole=event-creator --user=test-user
kubectl create clusterrolebinding test-user-namespaced-admin --clusterrole=admin --user=test-user
- name: Switch to test-user for test execution
run: kubectl config use-context test-user
- name: Run e2e tests
run: |
export CODEFLARE_TEST_TIMEOUT_SHORT=3m
export CODEFLARE_TEST_TIMEOUT_MEDIUM=15m
export CODEFLARE_TEST_TIMEOUT_LONG=20m
export CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING=30m
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
set -euo pipefail
go test -timeout 60m -v -skip "^Test.*Cpu$" ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
- name: Switch to kind-cluster context to print logs
if: always() && steps.deploy.outcome == 'success'
run: kubectl config use-context kind-cluster
- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing CodeFlare operator logs"
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log
- name: Print Kueue operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Kueue operator logs"
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log
- name: Print KubeRay operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log
- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
if: always() && steps.kind-install.outcome == 'success'
with:
output-directory: ${TEMP_DIR}
- name: Upload logs
uses: actions/upload-artifact@v4
if: always() && steps.kind-install.outcome == 'success'
with:
name: logs
retention-days: 10
path: |
${{ env.TEMP_DIR }}/**/*.log
- name: Post notification about failure to a Slack channel in case of push event
if: failure() && github.event_name == 'push'
uses: slackapi/[email protected]
with:
channel-id: "codeflare-nightlies"
slack-message: "e2e test on push failed, <https://github.com/project-codeflare/codeflare-operator/actions/workflows/e2e_tests.yaml|View workflow runs>"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}