Merge pull request #128 from appuio/fix/loki-ingester-stuck
Workaround for stuck loki-ingester pods
DebakelOrakel authored Apr 10, 2024
2 parents 84b2501 + 1f62f51 commit 87e07a1
Showing 7 changed files with 660 additions and 23 deletions.
11 changes: 11 additions & 0 deletions class/defaults.yml
@@ -122,6 +122,17 @@ parameters:
        limits:
          memory: 1.5Gi

    images:
      kubectl:
        registry: quay.io
        repository: appuio/oc
        tag: v4.14

    workaround:
      ingester_fix:
        schedule: '0,30 * * * *'
        sleep_time: 2m

  openshift4_elasticsearch_operator:
    targetNamespaces:
      - ${openshift4_logging:namespace}
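
The new images.kubectl and workaround.ingester_fix defaults can be tuned per cluster through the regular configuration hierarchy. A minimal sketch of such an override, assuming a standard Commodore/reclass setup (the values shown are only examples):

parameters:
  openshift4_logging:
    workaround:
      ingester_fix:
        schedule: '*/15 * * * *'
        sleep_time: 5m
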
26 changes: 3 additions & 23 deletions component/loki.libsonnet
@@ -3,6 +3,7 @@ local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
local po = import 'lib/patch-operator.libsonnet';
local workaround = import 'loki_workaround.libsonnet';

// The hiera parameters for the component
local inv = kap.inventory();
@@ -116,36 +117,15 @@ local aggregate_loki_log_access = kube.ClusterRole('syn:loki:cluster-reader') {
  ],
};

// Generate missing metrics SA token for Loki Operator.
//
// The ServiceMonitor for the Loki Operator references a SA token secret
// called `loki-operator-controller-manager-metrics-token` which doesn't exist
// on the cluster after the operator is installed or upgraded to 5.8.5 via
// OLM.
local operator_metrics_sa_token =
  kube.Secret('loki-operator-controller-manager-metrics-token') {
    metadata+: {
      // Loki operator is deployed in openshift-operators-redhat
      namespace: 'openshift-operators-redhat',
      annotations+: {
        'kubernetes.io/service-account.name': 'loki-operator-controller-manager-metrics-reader',
        // disable argocd prune/delete so removing the workaround should be
        // fairly easy in case the Loki Operator OLM install fixes the issue.
        'argocd.argoproj.io/sync-options': 'Prune=false,Delete=false',
      },
    },
    data:: {},
    type: 'kubernetes.io/service-account-token',
  };

// Define outputs below
if loki.enabled then
  {
    '50_loki_stack': lokistack,
    '50_loki_logstore': logstore,
    '50_loki_netpol': [ netpol_viewplugin, netpol_lokigateway ],
    '50_loki_rbac': [ aggregate_loki_log_access ],
    '50_loki_operator_metrics_token': [ operator_metrics_sa_token ],
    '50_loki_operator_metrics_token': workaround.missing_metrics_token,
    '50_loki_ingester_fix': workaround.ingester_stuck,
  }
else
  std.trace(
137 changes: 137 additions & 0 deletions component/loki_workaround.libsonnet
@@ -0,0 +1,137 @@
local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
local po = import 'lib/patch-operator.libsonnet';

// The hiera parameters for the component
local inv = kap.inventory();
local params = inv.parameters.openshift4_logging;


// Generate missing metrics SA token for Loki Operator.
//
// The ServiceMonitor for the Loki Operator references a SA token secret
// called `loki-operator-controller-manager-metrics-token` which doesn't exist
// on the cluster after the operator is installed or upgraded to 5.8.5 via
// OLM.
local missing_metrics_token =
  kube.Secret('loki-operator-controller-manager-metrics-token') {
    metadata+: {
      // Loki operator is deployed in openshift-operators-redhat
      namespace: 'openshift-operators-redhat',
      annotations+: {
        'kubernetes.io/service-account.name': 'loki-operator-controller-manager-metrics-reader',
        // disable argocd prune/delete so removing the workaround should be
        // fairly easy in case the Loki Operator OLM install fixes the issue.
        'argocd.argoproj.io/sync-options': 'Prune=false,Delete=false',
      },
    },
    data:: {},
    type: 'kubernetes.io/service-account-token',
  };


// Workaround for stuck loki-ingester.
// To be removed, once upstream is fixed.

local ingester_stuck = [
  kube.ServiceAccount('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
  },
  kube.Role('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
    rules: [ {
      apiGroups: [ '' ],
      resources: [ 'pods', 'pods/exec' ],
      verbs: [
        'get',
        'list',
        'watch',
        'create',
        'delete',
        'patch',
        'update',
      ],
    } ],
  },
  kube.RoleBinding('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
    roleRef: {
      apiGroup: 'rbac.authorization.k8s.io',
      kind: 'Role',
      name: 'loki-ingester-check',
    },
    subjects: [ {
      kind: 'ServiceAccount',
      name: 'loki-ingester-check',
    } ],
  },
  kube.ConfigMap('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
    data: {
      'wal-check.sh': importstr 'workaround-scripts/wal-check.sh',
    },
  },
  kube.CronJob('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
    spec: {
      schedule: params.workaround.ingester_fix.schedule,
      concurrencyPolicy: 'Forbid',
      failedJobsHistoryLimit: 0,
      jobTemplate: {
        spec: {
          activeDeadlineSeconds: 360,
          backoffLimit: 1,
          template: {
            spec: {
              containers: [ {
                name: 'check-pod',
                image: '%(registry)s/%(repository)s:%(tag)s' % params.images.kubectl,
                imagePullPolicy: 'IfNotPresent',
                command: [ '/usr/local/bin/wal-check.sh' ],
                env: [ {
                  name: 'SLEEP_TIME',
                  value: params.workaround.ingester_fix.sleep_time,
                } ],
                ports: [],
                stdin: false,
                tty: false,
                volumeMounts: [ {
                  mountPath: '/usr/local/bin/wal-check.sh',
                  name: 'wal-check',
                  readOnly: true,
                  subPath: 'wal-check.sh',
                } ],
              } ],
              nodeSelector: { 'node-role.kubernetes.io/infra': '' },
              restartPolicy: 'Never',
              serviceAccountName: 'loki-ingester-check',
              volumes: [ {
                name: 'wal-check',
                configMap: {
                  defaultMode: 364,
                  name: 'loki-ingester-check',
                },
              } ],
            },
          },
        },
      },
    },
  },
];

{
  missing_metrics_token: [ missing_metrics_token ],
  ingester_stuck: ingester_stuck,
}
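
Compiled with the defaults above, the missing_metrics_token entry renders to roughly the following manifest (metadata labels that kube.libjsonnet adds are omitted here):

apiVersion: v1
kind: Secret
metadata:
  annotations:
    argocd.argoproj.io/sync-options: Prune=false,Delete=false
    kubernetes.io/service-account.name: loki-operator-controller-manager-metrics-reader
  name: loki-operator-controller-manager-metrics-token
  namespace: openshift-operators-redhat
type: kubernetes.io/service-account-token
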
50 changes: 50 additions & 0 deletions component/workaround-scripts/wal-check.sh
@@ -0,0 +1,50 @@
#!/bin/bash

set -e -o pipefail

# Check whether a pod is stuck: returns 1 if the pod is Running but its
# containers are not ready, 0 otherwise.
function check_pod() {
  POD_NAME="loki-ingester-${1}"
  echo "checking POD ${POD_NAME}"
  PHASE=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.phase')
  if [ ${PHASE} != "Running" ]; then
    return 0
  fi
  READY=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.conditions[] | select(.type == "ContainersReady") | .status')
  if [ ${READY} == "True" ]; then
    return 0
  fi
  return 1
}

# Check the pod's WAL directory; if the checkpoint has no matching WAL segment,
# remove the orphaned checkpoint and delete the pod so it restarts cleanly.
function check_dir() {
  shopt -s extglob
  POD_NAME="loki-ingester-${1}"
  echo "checking DIR ${POD_NAME}"
  DIR_CHP=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^checkpoint\.[0-9]*$")
  PATTERN=$(echo ${DIR_CHP} | sed 's/[^0-9]*//g')
  # grep exits non-zero when no WAL segment matches; '|| exit 0' keeps set -e
  # from aborting and leaves DIR_WAL empty.
  DIR_WAL=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^0*${PATTERN}$" || exit 0)
  if [ -z $DIR_WAL ]; then
    kubectl -n openshift-logging exec -i ${POD_NAME} -- rm -rf /tmp/wal/${DIR_CHP}
    kubectl -n openshift-logging delete po ${POD_NAME}
  fi
}

# Check if a pod stays stuck for longer than ${SLEEP_TIME}.
# Only fix one pod at a time and exit immediately once it is fixed.
function fix_pod() {
  if ! check_pod $1; then
    echo "stuck POD, waiting ${SLEEP_TIME}"
    sleep ${SLEEP_TIME}
    if ! check_pod $1; then
      check_dir $1
      exit 0
    fi
  fi
}

fix_pod 0
fix_pod 1

exit 0