Skip to content

Commit

Permalink
Simplify DataScienceCluster conditions
Browse files Browse the repository at this point in the history
This commit aims to simpplify the DataScienceCluster status conditions
by removing transient conditions (Reconciling, Deploying, etc) and
replacing them with two conditsion:

- Available that is used to reflects the work done by the controller, so
  an Available condition with status False would signal that the
  controller encountered some troubles while reconciling the status of
  the cluster
- Ready that is used to reflect the status of the component managed by
  the controller.

It is perfectly possible to be in a state where Available=False and
Ready=True which can happen in case the controller has troubles to
perform some reconciliation logic because of i.e. a transient kubernetes
error, but at the same time all the components are running.

The object would then result in soimething like:

    apiVersion: datasciencecluster.opendatahub.io/v1
    kind: DataScienceCluster
    metadata:
        name: default-dsc
    spec: {}
    status:
        conditions:
        - lastHeartbeatTime: "2025-01-15T12:54:20Z"
            lastTransitionTime: "2025-01-15T12:54:20Z"
            message: DataScienceCluster resource reconciled successfully
            reason: Available
            status: "True"
            type: Available
        - lastHeartbeatTime: "2025-01-15T12:54:20Z"
            lastTransitionTime: "2025-01-15T12:54:20Z"
            message: Ready
            reason: Ready
            status: "True"
            type: Ready

It is possible to wait for the DataScienceCluster to be ready by using
the kubectl wait command:

    kubectl wait --for=condition=ready datascienceclusters.datasciencecluster.opendatahub.io default-dsc
  • Loading branch information
lburgazzoli committed Jan 15, 2025
1 parent ad20d38 commit b40b2d8
Show file tree
Hide file tree
Showing 22 changed files with 734 additions and 153 deletions.
2 changes: 1 addition & 1 deletion controllers/components/codeflare/codeflare.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
2 changes: 1 addition & 1 deletion controllers/components/dashboard/dashboard.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
2 changes: 1 addition & 1 deletion controllers/components/kserve/kserve.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ const (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

// Init for set images.
Expand Down
2 changes: 1 addition & 1 deletion controllers/components/kueue/kueue.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
4 changes: 2 additions & 2 deletions controllers/components/modelcontroller/modelcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ import (
dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1"
"github.com/opendatahub-io/opendatahub-operator/v2/controllers/status"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
cr "github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
odhdeploy "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/annotations"
)

type componentHandler struct{}

func init() { //nolint:gochecknoinits
componentsregistry.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
4 changes: 2 additions & 2 deletions controllers/components/modelmeshserving/modelmeshserving.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ import (
dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1"
"github.com/opendatahub-io/opendatahub-operator/v2/controllers/status"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
cr "github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
odhdeploy "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/annotations"
)

type componentHandler struct{}

func init() { //nolint:gochecknoinits
componentsregistry.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
2 changes: 1 addition & 1 deletion controllers/components/modelregistry/modelregistry.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
2 changes: 1 addition & 1 deletion controllers/components/ray/ray.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
2 changes: 1 addition & 1 deletion controllers/components/trustyai/trustyai.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
2 changes: 1 addition & 1 deletion controllers/components/workbenches/workbenches.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
type componentHandler struct{}

func init() { //nolint:gochecknoinits
cr.Add(&componentHandler{})
cr.R.Add(&componentHandler{})
}

func (s *componentHandler) GetName() string {
Expand Down
135 changes: 9 additions & 126 deletions controllers/datasciencecluster/datasciencecluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,9 @@ import (
"slices"
"strings"

operatorv1 "github.com/openshift/api/operator/v1"
conditionsv1 "github.com/openshift/custom-resource-status/conditions/v1"
corev1 "k8s.io/api/core/v1"
k8serr "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
Expand All @@ -39,12 +36,10 @@ import (
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/opendatahub-io/opendatahub-operator/v2/apis/common"
componentApi "github.com/opendatahub-io/opendatahub-operator/v2/apis/components/v1alpha1"
dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1"
dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
"github.com/opendatahub-io/opendatahub-operator/v2/controllers/status"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
cr "github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
odhClient "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/client"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/handlers"
Expand Down Expand Up @@ -97,14 +92,18 @@ func (r *DataScienceClusterReconciler) Reconcile(ctx context.Context, req ctrl.R
// validate pre-requisites
if err := r.validate(ctx, instance); err != nil {
log.Info(err.Error())
status.SetCondition(&instance.Status.Conditions, "Degraded", status.ReconcileFailed, err.Error(), corev1.ConditionTrue)

status.SetCondition(
&instance.Status.Conditions,
string(conditionsv1.ConditionAvailable),
status.DegradedReason,
err.Error(),
corev1.ConditionFalse,
)
}

// deploy components
if err := r.reconcileComponents(ctx, instance); err != nil {
log.Info(err.Error())
status.SetCondition(&instance.Status.Conditions, "Degraded", status.ReconcileFailed, err.Error(), corev1.ConditionTrue)
}
reconcileComponents(ctx, r.Client, instance, cr.R)

// keep conditions sorted
slices.SortFunc(instance.Status.Conditions, func(a, b conditionsv1.Condition) int {
Expand Down Expand Up @@ -150,122 +149,6 @@ func (r *DataScienceClusterReconciler) validate(ctx context.Context, _ *dscv1.Da
return nil
}

func (r *DataScienceClusterReconciler) reconcileComponents(ctx context.Context, instance *dscv1.DataScienceCluster) error {
log := logf.FromContext(ctx).WithName("DataScienceCluster")

notReadyComponents := make([]string, 0)

// all DSC defined components
componentErrors := cr.ForEach(func(component cr.ComponentHandler) error {
ci, err := r.reconcileComponent(ctx, instance, component)
if err != nil {
return err
}

if !cr.IsManaged(component, instance) {
return nil
}

if !meta.IsStatusConditionTrue(ci.GetStatus().Conditions, status.ConditionTypeReady) {
notReadyComponents = append(notReadyComponents, component.GetName())
}

return nil
})

// Process errors for components
if componentErrors != nil {
log.Info("DataScienceCluster Deployment Incomplete.")

status.SetCompleteCondition(
&instance.Status.Conditions,
status.ReconcileCompletedWithComponentErrors,
fmt.Sprintf("DataScienceCluster resource reconciled with component errors: %v", componentErrors),
)

r.Recorder.Eventf(instance, corev1.EventTypeNormal,
"DataScienceClusterComponentFailures",
"DataScienceCluster instance %s created, but have some failures in component %v", instance.Name, componentErrors)
} else {
log.Info("DataScienceCluster Deployment Completed.")

// finalize reconciliation
status.SetCompleteCondition(
&instance.Status.Conditions,
status.ReconcileCompleted,
"DataScienceCluster resource reconciled successfully",
)
}

if len(notReadyComponents) != 0 {
instance.Status.Phase = status.PhaseNotReady

conditionsv1.SetStatusCondition(&instance.Status.Conditions, conditionsv1.Condition{
Type: conditionsv1.ConditionType(status.ConditionTypeReady),
Status: corev1.ConditionFalse,
Reason: "NotReady",
Message: fmt.Sprintf("Some components are not ready: %s", strings.Join(notReadyComponents, ",")),
})
} else {
instance.Status.Phase = status.PhaseReady

conditionsv1.SetStatusCondition(&instance.Status.Conditions, conditionsv1.Condition{
Type: conditionsv1.ConditionType(status.ConditionTypeReady),
Status: corev1.ConditionTrue,
Reason: "Ready",
Message: "Ready",
})
}

instance.Status.Release = cluster.GetRelease()
instance.Status.ObservedGeneration = instance.Generation

if componentErrors != nil {
return componentErrors
}

return nil
}

func (r *DataScienceClusterReconciler) reconcileComponent(
ctx context.Context,
instance *dscv1.DataScienceCluster,
component cr.ComponentHandler,
) (common.PlatformObject, error) {
ms := component.GetManagementState(instance)
componentCR := component.NewCRObject(instance)

switch ms {
case operatorv1.Managed:
err := ctrl.SetControllerReference(instance, componentCR, r.Scheme)
if err != nil {
return nil, err
}
err = r.Client.Apply(ctx, componentCR, client.FieldOwner(fieldOwner), client.ForceOwnership)
if err != nil {
return nil, err
}
case operatorv1.Removed:
err := r.Client.Delete(ctx, componentCR, client.PropagationPolicy(metav1.DeletePropagationForeground))
if err != nil && !k8serr.IsNotFound(err) {
return nil, err
}
default:
return nil, fmt.Errorf("unsupported management state: %s", ms)
}

if instance.Status.InstalledComponents == nil {
instance.Status.InstalledComponents = make(map[string]bool)
}

err := component.UpdateDSCStatus(instance, componentCR)
if err != nil {
return nil, fmt.Errorf("failed to update status of DataScienceCluster component %s: %w", component.GetName(), err)
}

return componentCR, nil
}

func (r *DataScienceClusterReconciler) reportError(ctx context.Context, err error, instance *dscv1.DataScienceCluster, message string) {
logf.FromContext(ctx).Error(err, message, "instance.Name", instance.Name)
r.Recorder.Eventf(instance, corev1.EventTypeWarning, "DataScienceClusterReconcileError",
Expand Down
Loading

0 comments on commit b40b2d8

Please sign in to comment.