Merge pull request #128 from appuio/fix/loki-ingester-stuck
Workaround for stuck loki-ingester pods
DebakelOrakel authored Apr 10, 2024
2 parents 84b2501 + 1f62f51 commit 87e07a1
Showing 7 changed files with 660 additions and 23 deletions.
11 changes: 11 additions & 0 deletions class/defaults.yml
@@ -122,6 +122,17 @@ parameters:
        limits:
          memory: 1.5Gi

    images:
      kubectl:
        registry: quay.io
        repository: appuio/oc
        tag: v4.14

    workaround:
      ingester_fix:
        schedule: '0,30 * * * *'
        sleep_time: 2m

  openshift4_elasticsearch_operator:
    targetNamespaces:
      - ${openshift4_logging:namespace}
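
The new images.kubectl and workaround.ingester_fix defaults can be tuned per cluster through the regular configuration hierarchy. A minimal sketch of such an override, assuming a standard Commodore/reclass setup (the values shown are only examples):

parameters:
  openshift4_logging:
    workaround:
      ingester_fix:
        schedule: '*/15 * * * *'
        sleep_time: 5m
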
26 changes: 3 additions & 23 deletions component/loki.libsonnet
@@ -3,6 +3,7 @@ local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
local po = import 'lib/patch-operator.libsonnet';
local workaround = import 'loki_workaround.libsonnet';

// The hiera parameters for the component
local inv = kap.inventory();
@@ -116,36 +117,15 @@ local aggregate_loki_log_access = kube.ClusterRole('syn:loki:cluster-reader') {
  ],
};

// Generate missing metrics SA token for Loki Operator.
//
// The ServiceMonitor for the Loki Operator references a SA token secret
// called `loki-operator-controller-manager-metrics-token` which doesn't exist
// on the cluster after the operator is installed or upgraded to 5.8.5 via
// OLM.
local operator_metrics_sa_token =
  kube.Secret('loki-operator-controller-manager-metrics-token') {
    metadata+: {
      // Loki operator is deployed in openshift-operators-redhat
      namespace: 'openshift-operators-redhat',
      annotations+: {
        'kubernetes.io/service-account.name': 'loki-operator-controller-manager-metrics-reader',
        // disable argocd prune/delete so removing the workaround should be
        // fairly easy in case the Loki Operator OLM install fixes the issue.
        'argocd.argoproj.io/sync-options': 'Prune=false,Delete=false',
      },
    },
    data:: {},
    type: 'kubernetes.io/service-account-token',
  };

// Define outputs below
if loki.enabled then
  {
    '50_loki_stack': lokistack,
    '50_loki_logstore': logstore,
    '50_loki_netpol': [ netpol_viewplugin, netpol_lokigateway ],
    '50_loki_rbac': [ aggregate_loki_log_access ],
    '50_loki_operator_metrics_token': [ operator_metrics_sa_token ],
    '50_loki_operator_metrics_token': workaround.missing_metrics_token,
    '50_loki_ingester_fix': workaround.ingester_stuck,
  }
else
  std.trace(
137 changes: 137 additions & 0 deletions component/loki_workaround.libsonnet
@@ -0,0 +1,137 @@
local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
local po = import 'lib/patch-operator.libsonnet';

// The hiera parameters for the component
local inv = kap.inventory();
local params = inv.parameters.openshift4_logging;


// Generate missing metrics SA token for Loki Operator.
//
// The ServiceMonitor for the Loki Operator references a SA token secret
// called `loki-operator-controller-manager-metrics-token` which doesn't exist
// on the cluster after the operator is installed or upgraded to 5.8.5 via
// OLM.
local missing_metrics_token =
  kube.Secret('loki-operator-controller-manager-metrics-token') {
    metadata+: {
      // Loki operator is deployed in openshift-operators-redhat
      namespace: 'openshift-operators-redhat',
      annotations+: {
        'kubernetes.io/service-account.name': 'loki-operator-controller-manager-metrics-reader',
        // disable argocd prune/delete so removing the workaround should be
        // fairly easy in case the Loki Operator OLM install fixes the issue.
        'argocd.argoproj.io/sync-options': 'Prune=false,Delete=false',
      },
    },
    data:: {},
    type: 'kubernetes.io/service-account-token',
  };


// Workaround for stuck loki-ingester.
// To be removed, once upstream is fixed.

local ingester_stuck = [
  kube.ServiceAccount('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
  },
  kube.Role('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
    rules: [ {
      apiGroups: [ '' ],
      resources: [ 'pods', 'pods/exec' ],
      verbs: [
        'get',
        'list',
        'watch',
        'create',
        'delete',
        'patch',
        'update',
      ],
    } ],
  },
  kube.RoleBinding('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
    roleRef: {
      apiGroup: 'rbac.authorization.k8s.io',
      kind: 'Role',
      name: 'loki-ingester-check',
    },
    subjects: [ {
      kind: 'ServiceAccount',
      name: 'loki-ingester-check',
    } ],
  },
  kube.ConfigMap('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
    data: {
      'wal-check.sh': importstr 'workaround-scripts/wal-check.sh',
    },
  },
  kube.CronJob('loki-ingester-check') {
    metadata+: {
      namespace: params.namespace,
    },
    spec: {
      schedule: params.workaround.ingester_fix.schedule,
      concurrencyPolicy: 'Forbid',
      failedJobsHistoryLimit: 0,
      jobTemplate: {
        spec: {
          activeDeadlineSeconds: 360,
          backoffLimit: 1,
          template: {
            spec: {
              containers: [ {
                name: 'check-pod',
                image: '%(registry)s/%(repository)s:%(tag)s' % params.images.kubectl,
                imagePullPolicy: 'IfNotPresent',
                command: [ '/usr/local/bin/wal-check.sh' ],
                env: [ {
                  name: 'SLEEP_TIME',
                  value: params.workaround.ingester_fix.sleep_time,
                } ],
                ports: [],
                stdin: false,
                tty: false,
                volumeMounts: [ {
                  mountPath: '/usr/local/bin/wal-check.sh',
                  name: 'wal-check',
                  readOnly: true,
                  subPath: 'wal-check.sh',
                } ],
              } ],
              nodeSelector: { 'node-role.kubernetes.io/infra': '' },
              restartPolicy: 'Never',
              serviceAccountName: 'loki-ingester-check',
              volumes: [ {
                name: 'wal-check',
                configMap: {
                  defaultMode: 364,
                  name: 'loki-ingester-check',
                },
              } ],
            },
          },
        },
      },
    },
  },
];

{
  missing_metrics_token: [ missing_metrics_token ],
  ingester_stuck: ingester_stuck,
}
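
Compiled with the defaults above, the missing_metrics_token entry renders to roughly the following manifest (metadata labels that kube.libjsonnet adds are omitted here):

apiVersion: v1
kind: Secret
metadata:
  annotations:
    argocd.argoproj.io/sync-options: Prune=false,Delete=false
    kubernetes.io/service-account.name: loki-operator-controller-manager-metrics-reader
  name: loki-operator-controller-manager-metrics-token
  namespace: openshift-operators-redhat
type: kubernetes.io/service-account-token
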
50 changes: 50 additions & 0 deletions component/workaround-scripts/wal-check.sh
@@ -0,0 +1,50 @@
#!/bin/bash

set -e -o pipefail

# Check whether a pod is stuck: returns 1 if the pod is Running but its
# containers are not ready, 0 otherwise.
function check_pod() {
  POD_NAME="loki-ingester-${1}"
  echo "checking POD ${POD_NAME}"
  PHASE=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.phase')
  if [ ${PHASE} != "Running" ]; then
    return 0
  fi
  READY=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.conditions[] | select(.type == "ContainersReady") | .status')
  if [ ${READY} == "True" ]; then
    return 0
  fi
  return 1
}

# Check the pod's WAL directory; if the checkpoint has no matching WAL segment,
# remove the orphaned checkpoint and delete the pod so it restarts cleanly.
function check_dir() {
  shopt -s extglob
  POD_NAME="loki-ingester-${1}"
  echo "checking DIR ${POD_NAME}"
  DIR_CHP=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^checkpoint\.[0-9]*$")
  PATTERN=$(echo ${DIR_CHP} | sed 's/[^0-9]*//g')
  # grep exits non-zero when no WAL segment matches; '|| exit 0' keeps set -e
  # from aborting and leaves DIR_WAL empty.
  DIR_WAL=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^0*${PATTERN}$" || exit 0)
  if [ -z $DIR_WAL ]; then
    kubectl -n openshift-logging exec -i ${POD_NAME} -- rm -rf /tmp/wal/${DIR_CHP}
    kubectl -n openshift-logging delete po ${POD_NAME}
  fi
}

# Check if a pod stays stuck for longer than ${SLEEP_TIME}.
# Only fix one pod at a time and exit immediately once it is fixed.
function fix_pod() {
  if ! check_pod $1; then
    echo "stuck POD, waiting ${SLEEP_TIME}"
    sleep ${SLEEP_TIME}
    if ! check_pod $1; then
      check_dir $1
      exit 0
    fi
  fi
}

fix_pod 0
fix_pod 1

exit 0