-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #128 from appuio/fix/loki-ingester-stuck
Workaround for stuck loki-ingester pods
- Loading branch information
Showing
7 changed files
with
660 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
local com = import 'lib/commodore.libjsonnet'; | ||
local kap = import 'lib/kapitan.libjsonnet'; | ||
local kube = import 'lib/kube.libjsonnet'; | ||
local po = import 'lib/patch-operator.libsonnet'; | ||
|
||
// The hiera parameters for the component | ||
local inv = kap.inventory(); | ||
local params = inv.parameters.openshift4_logging; | ||
|
||
|
||
// Workaround: create the metrics SA token secret the Loki Operator expects.
//
// The operator's ServiceMonitor points at a SA token secret named
// `loki-operator-controller-manager-metrics-token`. That secret is absent on
// the cluster after the operator has been installed or upgraded to 5.8.5 via
// OLM, so it is generated here.
local missing_metrics_token =
  local secret_name = 'loki-operator-controller-manager-metrics-token';
  local token_annotations = {
    // Service account the token is issued for.
    'kubernetes.io/service-account.name': 'loki-operator-controller-manager-metrics-reader',
    // ArgoCD must neither prune nor delete this secret, which keeps removal
    // of the workaround simple should the OLM install fix the issue.
    'argocd.argoproj.io/sync-options': 'Prune=false,Delete=false',
  };
  kube.Secret(secret_name) {
    type: 'kubernetes.io/service-account-token',
    // Hidden field (`::`): no `data` key is rendered into the manifest.
    data:: {},
    metadata+: {
      // The Loki operator is deployed in openshift-operators-redhat.
      namespace: 'openshift-operators-redhat',
      annotations+: token_annotations,
    },
  };
|
||
|
||
// Workaround for stuck loki-ingester pods.
// To be removed once the problem is fixed upstream.
//
// Renders a ServiceAccount, Role, RoleBinding, ConfigMap and CronJob that all
// share the name `loki-ingester-check` and live in `params.namespace`. The
// CronJob runs the `wal-check.sh` script shipped in the ConfigMap.
local ingester_stuck =
  // Name shared by every object of the workaround.
  local check_name = 'loki-ingester-check';
  // Mixin that places an object into the component's namespace.
  local in_namespace = { metadata+: { namespace: params.namespace } };
  // File name of the script inside the ConfigMap and its path in the container.
  local script_key = 'wal-check.sh';
  local script_path = '/usr/local/bin/wal-check.sh';
  local script_volume = 'wal-check';

  // Container executing the check script with the configured sleep time.
  local check_container = {
    name: 'check-pod',
    image: '%(registry)s/%(repository)s:%(tag)s' % params.images.kubectl,
    imagePullPolicy: 'IfNotPresent',
    command: [ script_path ],
    env: [
      { name: 'SLEEP_TIME', value: params.workaround.ingester_fix.sleep_time },
    ],
    ports: [],
    stdin: false,
    tty: false,
    volumeMounts: [
      {
        name: script_volume,
        mountPath: script_path,
        subPath: script_key,
        readOnly: true,
      },
    ],
  };

  // Pod spec of the job; the script is mounted from the ConfigMap.
  local pod_spec = {
    containers: [ check_container ],
    nodeSelector: { 'node-role.kubernetes.io/infra': '' },
    restartPolicy: 'Never',
    serviceAccountName: check_name,
    volumes: [
      {
        name: script_volume,
        configMap: {
          // 364 (decimal) is 0554 in octal.
          defaultMode: 364,
          name: check_name,
        },
      },
    ],
  };

  [
    kube.ServiceAccount(check_name) + in_namespace,

    kube.Role(check_name) + in_namespace + {
      rules: [
        {
          apiGroups: [ '' ],
          resources: [ 'pods', 'pods/exec' ],
          verbs: [ 'get', 'list', 'watch', 'create', 'delete', 'patch', 'update' ],
        },
      ],
    },

    kube.RoleBinding(check_name) + in_namespace + {
      roleRef: {
        apiGroup: 'rbac.authorization.k8s.io',
        kind: 'Role',
        name: check_name,
      },
      subjects: [
        { kind: 'ServiceAccount', name: check_name },
      ],
    },

    kube.ConfigMap(check_name) + in_namespace + {
      data: {
        [script_key]: importstr 'workaround-scripts/wal-check.sh',
      },
    },

    kube.CronJob(check_name) + in_namespace + {
      // `spec` is replaced as a whole, not merged into the library default.
      spec: {
        schedule: params.workaround.ingester_fix.schedule,
        concurrencyPolicy: 'Forbid',
        failedJobsHistoryLimit: 0,
        jobTemplate: {
          spec: {
            activeDeadlineSeconds: 360,
            backoffLimit: 1,
            template: { spec: pod_spec },
          },
        },
      },
    },
  ];
|
||
// Objects exposed by this file: the stuck-ingester workaround as a list of
// objects, and the metrics token secret wrapped in a single-element list.
{
  ingester_stuck: ingester_stuck,
  missing_metrics_token: [
    missing_metrics_token,
  ],
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/bin/bash | ||
|
||
set -e -o pipefail | ||
|
||
# Check if a loki-ingester pod is in the stuck state.
#
# Arguments:
#   $1 - ordinal of the pod; the pod inspected is "loki-ingester-$1".
# Outputs:
#   Writes a "checking POD ..." line to stdout.
# Returns:
#   0 if the pod is not stuck: its phase is anything other than "Running"
#     (including an empty phase when the lookup yields nothing), or its
#     ContainersReady condition is "True".
#   1 if the pod is Running but its containers are not ready (stuck).
function check_pod() {
  local pod_name phase ready
  pod_name="loki-ingester-${1}"
  echo "checking POD ${pod_name}"

  phase=$(kubectl -n openshift-logging get po "${pod_name}" -oyaml | yq '.status.phase')
  # The operand must be quoted: with an empty value an unquoted test collapses
  # to `[ != "Running" ]`, which is a syntax error rather than a comparison.
  if [ "${phase}" != "Running" ]; then
    return 0
  fi

  ready=$(kubectl -n openshift-logging get po "${pod_name}" -oyaml | yq '.status.conditions[] | select(.type == "ContainersReady") | .status')
  if [ "${ready}" = "True" ]; then
    return 0
  fi

  return 1
}
|
||
# Check the WAL directory of a pod and remove a checkpoint that has no
# matching WAL segment, then delete the pod so it gets recreated.
#
# Arguments:
#   $1 - ordinal of the pod; the pod inspected is "loki-ingester-$1".
# Outputs:
#   Writes progress messages to stdout.
# Returns:
#   0 when there is no checkpoint entry, or when the checkpoint has a matching
#   segment; otherwise the status of the final kubectl call.
#
# NOTE(review): assumes /tmp/wal holds at most one `checkpoint.<digits>` entry;
# several entries are not handled specially - confirm against Loki's WAL layout.
function check_dir() {
  # NOTE(review): no extended glob pattern is used below; kept because the
  # original enabled it, presumably a leftover.
  shopt -s extglob
  local pod_name dir_chp pattern dir_wal
  pod_name="loki-ingester-${1}"
  echo "checking DIR ${pod_name}"

  # `|| true`: a listing without a checkpoint must not abort the script when
  # it runs with `set -e -o pipefail`.
  dir_chp=$(kubectl -n openshift-logging exec -i "${pod_name}" -- ls /tmp/wal | grep -o "^checkpoint\.[0-9]*$" || true)
  if [ -z "${dir_chp}" ]; then
    # Without this guard the removal below would target "/tmp/wal/" itself.
    echo "no checkpoint found in ${pod_name}, nothing to clean up"
    return 0
  fi

  # Keep only the digits of the checkpoint name.
  pattern="${dir_chp//[^0-9]/}"
  # Look for an entry that is the same number with optional extra leading zeros.
  dir_wal=$(kubectl -n openshift-logging exec -i "${pod_name}" -- ls /tmp/wal | grep -o "^0*${pattern}$" || true)
  if [ -z "${dir_wal}" ]; then
    kubectl -n openshift-logging exec -i "${pod_name}" -- rm -rf "/tmp/wal/${dir_chp}"
    kubectl -n openshift-logging delete po "${pod_name}"
  fi
}
|
||
# Check whether a pod stays in the stuck state for longer than ${SLEEP_TIME}
# and, if so, run the WAL directory check on it.
# Only 1 pod is fixed at a time: the script exits immediately once a stuck pod
# has been handled.
#
# Arguments:
#   $1 - ordinal of the pod, passed on to check_pod and check_dir.
# Environment:
#   SLEEP_TIME - duration handed to `sleep` between the two checks.
# Returns:
#   0 if the pod is not stuck (on the first or the second check). Does not
#   return when the pod is still stuck after waiting: the script exits with 0
#   after check_dir has run.
function fix_pod() {
  if check_pod "$1"; then
    return 0
  fi

  echo "stuck POD, waiting ${SLEEP_TIME}"
  sleep "${SLEEP_TIME}"

  # The pod recovered while waiting: leave it alone.
  if check_pod "$1"; then
    return 0
  fi

  check_dir "$1"
  exit 0
}
|
||
# Inspect the ingester pods with ordinal 0 and 1, in that order. fix_pod ends
# the script on its own once it has handled a stuck pod.
for ordinal in 0 1; do
  fix_pod "${ordinal}"
done

exit 0
Oops, something went wrong.