Skip to content

Commit

Permalink
Merge pull request #9207 from Icinga/bugfix/suppressed-state-notifica…
Browse files Browse the repository at this point in the history
…tions

Checkable: send state notifications after suppression if and only if the state differs compared to before the suppression started
  • Loading branch information
julianbrost authored Mar 7, 2022
2 parents 23a4640 + 90848f6 commit b4fd4c6
Show file tree
Hide file tree
Showing 14 changed files with 411 additions and 33 deletions.
37 changes: 37 additions & 0 deletions doc/19-technical-concepts.md
Original file line number Diff line number Diff line change
Expand Up @@ -1370,6 +1370,43 @@ Message updates will be dropped when:
* Checkable does not exist.
* Origin endpoint's zone is not allowed to access this checkable.
#### event::SetStateBeforeSuppression <a id="technical-concepts-json-rpc-messages-event-setstatebeforesuppression"></a>
> Location: `clusterevents.cpp`
##### Message Body
Key | Value
----------|---------------------------------
jsonrpc | 2.0
method | event::SetStateBeforeSuppression
params | Dictionary
##### Params
Key | Type | Description
---------------------------|--------|-----------------------------------------------
host | String | Host name
service | String | Service name
state\_before\_suppression | Number | Checkable state before the current suppression
##### Functions
Event Sender: `Checkable::OnStateBeforeSuppressionChanged`
Event Receiver: `StateBeforeSuppressionChangedAPIHandler`
Used to sync the checkable state from before a notification suppression (for example
because the checkable is in a downtime) started within the same HA zone.
##### Permissions
The receiver will not process messages from not configured endpoints.
Message updates will be dropped when:
* Checkable does not exist.
* Origin endpoint is not within the local zone.
#### event::SetSuppressedNotifications <a id="technical-concepts-json-rpc-messages-event-setsupressednotifications"></a>
> Location: `clusterevents.cpp`
Expand Down
26 changes: 20 additions & 6 deletions lib/icinga/checkable-check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,12 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig

if (send_notification && !is_flapping) {
if (!IsPaused()) {
if (suppress_notification) {
/* If there are still some pending suppressed state notification, keep the suppression until these are
* handled by Checkable::FireSuppressedNotifications().
*/
bool pending = GetSuppressedNotifications() & (NotificationRecovery|NotificationProblem);

if (suppress_notification || pending) {
suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
} else {
OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
Expand All @@ -496,12 +501,21 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
int suppressed_types_before (GetSuppressedNotifications());
int suppressed_types_after (suppressed_types_before | suppressed_types);

for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
/* E.g. problem and recovery notifications neutralize each other. */
const int conflict = NotificationFlappingStart | NotificationFlappingEnd;
if ((suppressed_types_after & conflict) == conflict) {
/* Flapping start and end cancel out each other. */
suppressed_types_after &= ~conflict;
}

if ((suppressed_types_after & conflict) == conflict) {
suppressed_types_after &= ~conflict;
}
const int stateNotifications = NotificationRecovery | NotificationProblem;
if (!(suppressed_types_before & stateNotifications) && (suppressed_types & stateNotifications)) {
/* A state-related notification is suppressed for the first time, store the previous state. When
* notifications are no longer suppressed, this can be compared with the current state to determine
* if a notification must be sent. This is done differently compared to flapping notifications just above
* as for state notifications, problem and recovery don't always cancel each other. For example,
* WARNING -> OK -> CRITICAL generates both types once, but there should still be a notification.
*/
SetStateBeforeSuppression(old_stateType == StateTypeHard ? old_state : ServiceOK);
}

if (suppressed_types_after != suppressed_types_before) {
Expand Down
71 changes: 51 additions & 20 deletions lib/icinga/checkable-notification.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,34 +129,34 @@ void Checkable::UnregisterNotification(const Notification::Ptr& notification)
m_Notifications.erase(notification);
}

static void FireSuppressedNotifications(Checkable* checkable)
void Checkable::FireSuppressedNotifications()
{
if (!checkable->IsActive())
if (!IsActive())
return;

if (checkable->IsPaused())
if (IsPaused())
return;

if (!checkable->GetEnableNotifications())
if (!GetEnableNotifications())
return;

int suppressed_types (checkable->GetSuppressedNotifications());
int suppressed_types (GetSuppressedNotifications());
if (!suppressed_types)
return;

int subtract = 0;

{
LazyInit<bool> wasLastParentRecoveryRecent ([&checkable]() {
auto cr (checkable->GetLastCheckResult());
LazyInit<bool> wasLastParentRecoveryRecent ([this]() {
auto cr (GetLastCheckResult());

if (!cr) {
return true;
}

auto threshold (cr->GetExecutionStart());

for (auto& dep : checkable->GetDependencies()) {
for (auto& dep : GetDependencies()) {
auto parent (dep->GetParent());
ObjectLock oLock (parent);

Expand All @@ -168,13 +168,44 @@ static void FireSuppressedNotifications(Checkable* checkable)
return false;
});

for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
if (suppressed_types & (NotificationProblem|NotificationRecovery)) {
CheckResult::Ptr cr = GetLastCheckResult();
NotificationType type = cr && IsStateOK(cr->GetState()) ? NotificationRecovery : NotificationProblem;
bool state_suppressed = NotificationReasonSuppressed(NotificationProblem) || NotificationReasonSuppressed(NotificationRecovery);

/* Only process (i.e. send or dismiss) suppressed state notifications if the following conditions are met:
*
* 1. State notifications are not suppressed at the moment. State notifications must only be removed from
* the suppressed notifications bitset after the reason for the suppression is gone as these bits are
* used as a marker for when to set the state_before_suppression attribute.
* 2. The checkable is in a hard state. Soft states represent a state where we are not certain yet about
* the actual state and wait with sending notifications. If we want to immediately send a notification,
* we might send a recovery notification for something that just started failing or a problem
* notification which might be for an intermittent problem that would have never received a
* notification if there was no suppression as it still was in a soft state. Both cases aren't ideal so
* better wait until we are certain.
* 3. The checkable isn't likely checked soon. For example, if a downtime ended, give the checkable a
* chance to recover afterwards before sending a notification.
* 4. No parent recovered recently. Similar to the previous condition, give the checkable a chance to
* recover after one of its dependencies recovered before sending a notification.
*
* If any of these conditions is not met, processing the suppressed notification is further delayed.
*/
if (!state_suppressed && GetStateType() == StateTypeHard && !IsLikelyToBeCheckedSoon() && !wasLastParentRecoveryRecent.Get()) {
if (NotificationReasonApplies(type)) {
Checkable::OnNotificationsRequested(this, type, cr, "", "", nullptr);
}
subtract |= NotificationRecovery|NotificationProblem;
}
}

for (auto type : {NotificationFlappingStart, NotificationFlappingEnd}) {
if (suppressed_types & type) {
bool still_applies = checkable->NotificationReasonApplies(type);
bool still_applies = NotificationReasonApplies(type);

if (still_applies) {
if (!checkable->NotificationReasonSuppressed(type) && !checkable->IsLikelyToBeCheckedSoon() && !wasLastParentRecoveryRecent.Get()) {
Checkable::OnNotificationsRequested(checkable, type, checkable->GetLastCheckResult(), "", "", nullptr);
if (!NotificationReasonSuppressed(type) && !IsLikelyToBeCheckedSoon() && !wasLastParentRecoveryRecent.Get()) {
Checkable::OnNotificationsRequested(this, type, GetLastCheckResult(), "", "", nullptr);

subtract |= type;
}
Expand All @@ -186,28 +217,28 @@ static void FireSuppressedNotifications(Checkable* checkable)
}

if (subtract) {
ObjectLock olock (checkable);
ObjectLock olock (this);

int suppressed_types_before (checkable->GetSuppressedNotifications());
int suppressed_types_before (GetSuppressedNotifications());
int suppressed_types_after (suppressed_types_before & ~subtract);

if (suppressed_types_after != suppressed_types_before) {
checkable->SetSuppressedNotifications(suppressed_types_after);
SetSuppressedNotifications(suppressed_types_after);
}
}
}

/**
* Re-sends all notifications previously suppressed by e.g. downtimes if the notification reason still applies.
*/
void Checkable::FireSuppressedNotifications(const Timer * const&)
void Checkable::FireSuppressedNotificationsTimer(const Timer * const&)
{
for (auto& host : ConfigType::GetObjectsByType<Host>()) {
::FireSuppressedNotifications(host.get());
host->FireSuppressedNotifications();
}

for (auto& service : ConfigType::GetObjectsByType<Service>()) {
::FireSuppressedNotifications(service.get());
service->FireSuppressedNotifications();
}
}

Expand All @@ -224,12 +255,12 @@ bool Checkable::NotificationReasonApplies(NotificationType type)
case NotificationProblem:
{
auto cr (GetLastCheckResult());
return cr && !IsStateOK(cr->GetState()) && GetStateType() == StateTypeHard;
return cr && !IsStateOK(cr->GetState()) && cr->GetState() != GetStateBeforeSuppression();
}
case NotificationRecovery:
{
auto cr (GetLastCheckResult());
return cr && IsStateOK(cr->GetState());
return cr && IsStateOK(cr->GetState()) && cr->GetState() != GetStateBeforeSuppression();
}
case NotificationFlappingStart:
return IsFlapping();
Expand Down
2 changes: 1 addition & 1 deletion lib/icinga/checkable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ void Checkable::Start(bool runtimeCreated)
boost::call_once(once, []() {
l_CheckablesFireSuppressedNotifications = new Timer();
l_CheckablesFireSuppressedNotifications->SetInterval(5);
l_CheckablesFireSuppressedNotifications->OnTimerExpired.connect(&Checkable::FireSuppressedNotifications);
l_CheckablesFireSuppressedNotifications->OnTimerExpired.connect(&Checkable::FireSuppressedNotificationsTimer);
l_CheckablesFireSuppressedNotifications->Start();

l_CleanDeadlinedExecutions = new Timer();
Expand Down
4 changes: 3 additions & 1 deletion lib/icinga/checkable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ class Checkable : public ObjectImpl<Checkable>
bool NotificationReasonSuppressed(NotificationType type);
bool IsLikelyToBeCheckedSoon();

void FireSuppressedNotifications();

static void IncreasePendingChecks();
static void DecreasePendingChecks();
static int GetPendingChecks();
Expand Down Expand Up @@ -222,7 +224,7 @@ class Checkable : public ObjectImpl<Checkable>

static void NotifyDowntimeEnd(const Downtime::Ptr& downtime);

static void FireSuppressedNotifications(const Timer * const&);
static void FireSuppressedNotificationsTimer(const Timer * const&);
static void CleanDeadlinedExecutions(const Timer * const&);

/* Comments */
Expand Down
3 changes: 3 additions & 0 deletions lib/icinga/checkable.ti
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ abstract class Checkable : CustomVarObject
[state, no_user_view, no_user_modify] int suppressed_notifications {
default {{{ return 0; }}}
};
[state, enum, no_user_view, no_user_modify] ServiceState state_before_suppression {
default {{{ return ServiceOK; }}}
};

[config, navigation] name(Endpoint) command_endpoint (CommandEndpointRaw) {
navigate {{{
Expand Down
64 changes: 64 additions & 0 deletions lib/icinga/clusterevents.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ INITIALIZE_ONCE(&ClusterEvents::StaticInitialize);
REGISTER_APIFUNCTION(CheckResult, event, &ClusterEvents::CheckResultAPIHandler);
REGISTER_APIFUNCTION(SetNextCheck, event, &ClusterEvents::NextCheckChangedAPIHandler);
REGISTER_APIFUNCTION(SetLastCheckStarted, event, &ClusterEvents::LastCheckStartedChangedAPIHandler);
REGISTER_APIFUNCTION(SetStateBeforeSuppression, event, &ClusterEvents::StateBeforeSuppressionChangedAPIHandler);
REGISTER_APIFUNCTION(SetSuppressedNotifications, event, &ClusterEvents::SuppressedNotificationsChangedAPIHandler);
REGISTER_APIFUNCTION(SetSuppressedNotificationTypes, event, &ClusterEvents::SuppressedNotificationTypesChangedAPIHandler);
REGISTER_APIFUNCTION(SetNextNotification, event, &ClusterEvents::NextNotificationChangedAPIHandler);
Expand All @@ -45,6 +46,7 @@ void ClusterEvents::StaticInitialize()
Checkable::OnNewCheckResult.connect(&ClusterEvents::CheckResultHandler);
Checkable::OnNextCheckChanged.connect(&ClusterEvents::NextCheckChangedHandler);
Checkable::OnLastCheckStartedChanged.connect(&ClusterEvents::LastCheckStartedChangedHandler);
Checkable::OnStateBeforeSuppressionChanged.connect(&ClusterEvents::StateBeforeSuppressionChangedHandler);
Checkable::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationsChangedHandler);
Notification::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationTypesChangedHandler);
Notification::OnNextNotificationChanged.connect(&ClusterEvents::NextNotificationChangedHandler);
Expand Down Expand Up @@ -306,6 +308,68 @@ Value ClusterEvents::LastCheckStartedChangedAPIHandler(const MessageOrigin::Ptr&
return Empty;
}

void ClusterEvents::StateBeforeSuppressionChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin)
{
ApiListener::Ptr listener = ApiListener::GetInstance();

if (!listener)
return;

Host::Ptr host;
Service::Ptr service;
tie(host, service) = GetHostService(checkable);

Dictionary::Ptr params = new Dictionary();
params->Set("host", host->GetName());
if (service)
params->Set("service", service->GetShortName());
params->Set("state_before_suppression", checkable->GetStateBeforeSuppression());

Dictionary::Ptr message = new Dictionary();
message->Set("jsonrpc", "2.0");
message->Set("method", "event::SetStateBeforeSuppression");
message->Set("params", params);

listener->RelayMessage(origin, nullptr, message, true);
}

Value ClusterEvents::StateBeforeSuppressionChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params)
{
Endpoint::Ptr endpoint = origin->FromClient->GetEndpoint();

if (!endpoint) {
Log(LogNotice, "ClusterEvents")
<< "Discarding 'state before suppression changed' message from '" << origin->FromClient->GetIdentity() << "': Invalid endpoint origin (client not allowed).";
return Empty;
}

Host::Ptr host = Host::GetByName(params->Get("host"));

if (!host)
return Empty;

Checkable::Ptr checkable;

if (params->Contains("service"))
checkable = host->GetServiceByShortName(params->Get("service"));
else
checkable = host;

if (!checkable)
return Empty;

if (origin->FromZone && origin->FromZone != Zone::GetLocalZone()) {
Log(LogNotice, "ClusterEvents")
<< "Discarding 'state before suppression changed' message for checkable '" << checkable->GetName()
<< "' from '" << origin->FromClient->GetIdentity() << "': Unauthorized access.";
return Empty;
}

checkable->SetStateBeforeSuppression(ServiceState(int(params->Get("state_before_suppression"))), false, origin);

return Empty;
}

void ClusterEvents::SuppressedNotificationsChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin)
{
ApiListener::Ptr listener = ApiListener::GetInstance();
Expand Down
3 changes: 3 additions & 0 deletions lib/icinga/clusterevents.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ class ClusterEvents
static void LastCheckStartedChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);
static Value LastCheckStartedChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);

static void StateBeforeSuppressionChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);
static Value StateBeforeSuppressionChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);

static void SuppressedNotificationsChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);
static Value SuppressedNotificationsChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);

Expand Down
3 changes: 2 additions & 1 deletion lib/icinga/downtime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,13 @@ class Downtime final : public ObjectImpl<Downtime>
void TriggerDowntime(double triggerTime);
void SetRemovalInfo(const String& removedBy, double removeTime, const MessageOrigin::Ptr& origin = nullptr);

void OnAllConfigLoaded() override;

static String GetDowntimeIDFromLegacyID(int id);

static DowntimeChildOptions ChildOptionsFromValue(const Value& options);

protected:
void OnAllConfigLoaded() override;
void Start(bool runtimeCreated) override;
void Stop(bool runtimeRemoved) override;

Expand Down
4 changes: 2 additions & 2 deletions lib/icinga/downtime.ti
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ class Downtime : ConfigObject < DowntimeNameComposer
load_after Host;
load_after Service;

[config, protected, required, navigation(host)] name(Host) host_name {
[config, required, navigation(host)] name(Host) host_name {
navigate {{{
return Host::GetByName(GetHostName());
}}}
};
[config, protected, navigation(service)] String service_name {
[config, navigation(service)] String service_name {
track {{{
if (!oldValue.IsEmpty()) {
Service::Ptr service = Service::GetByNamePair(GetHostName(), oldValue);
Expand Down
Loading

0 comments on commit b4fd4c6

Please sign in to comment.