Improve tempest alert rules
Alert if results not seen for some time (if an expected test run was missed). Keep the failed test alert firing until a test run succeeds (this alert would resolve itself after 5min, which was undesirable). Change-Id: I4280cd181e709cef3bba6a114273cb32c12a9f7d
This commit is contained in:
parent
59afdcc479
commit
39cb6917bb
2
charms/tempest-k8s/.gitignore
vendored
Normal file
2
charms/tempest-k8s/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
# alert rules for this charm are dynamically written to disk
|
||||
/src/loki_alert_rules/*.rules
|
@ -51,6 +51,10 @@ from ops.model import (
|
||||
from ops_sunbeam.config_contexts import (
|
||||
ConfigContext,
|
||||
)
|
||||
from utils.alert_rules import (
|
||||
ensure_alert_rules_disabled,
|
||||
update_alert_rules_files,
|
||||
)
|
||||
from utils.cleanup import (
|
||||
CleanUpError,
|
||||
run_extensive_cleanup,
|
||||
@ -73,9 +77,12 @@ from utils.types import (
|
||||
TempestEnvVariant,
|
||||
)
|
||||
from utils.validators import (
|
||||
Schedule,
|
||||
validated_schedule,
|
||||
)
|
||||
|
||||
LOKI_RELATION_NAME = "logging"
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -85,7 +92,11 @@ class TempestConfigurationContext(ConfigContext):
|
||||
def context(self) -> dict:
|
||||
"""Tempest context."""
|
||||
return {
|
||||
"schedule": self.charm.get_schedule(),
|
||||
"schedule": (
|
||||
self.charm.get_schedule().value
|
||||
if self.charm.is_schedule_ready()
|
||||
else ""
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
@ -131,31 +142,24 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
|
||||
),
|
||||
]
|
||||
|
||||
def get_schedule(self) -> str:
|
||||
"""Return the schedule option if valid and should be enabled.
|
||||
def get_schedule(self) -> Schedule:
|
||||
"""Validate and return the schedule from config."""
|
||||
return validated_schedule(self.config["schedule"])
|
||||
|
||||
If the schedule option is invalid,
|
||||
or periodic checks shouldn't currently be enabled
|
||||
(eg. observability relations not ready),
|
||||
then return an empty schedule string.
|
||||
An empty string disables the schedule.
|
||||
def is_schedule_ready(self) -> bool:
|
||||
"""Check if the schedule is valid and periodic tests should be enabled.
|
||||
|
||||
Return True if the schedule config option is valid,
|
||||
and pre-requisites for periodic checks are ready.
|
||||
"""
|
||||
schedule = validated_schedule(self.config["schedule"])
|
||||
if not schedule.valid:
|
||||
return ""
|
||||
|
||||
# if tempest env isn't ready,
|
||||
# or if the logging relation isn't joined,
|
||||
# or if keystone isn't ready,
|
||||
# then we can't start scheduling periodic tests
|
||||
if not (
|
||||
self.is_tempest_ready()
|
||||
schedule = self.get_schedule()
|
||||
return (
|
||||
schedule.valid
|
||||
and schedule.value
|
||||
and self.is_tempest_ready()
|
||||
and self.loki.ready
|
||||
and self.user_id_ops.ready
|
||||
):
|
||||
return ""
|
||||
|
||||
return schedule.value
|
||||
)
|
||||
|
||||
@property
|
||||
def config_contexts(self) -> List[ConfigContext]:
|
||||
@ -188,7 +192,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
|
||||
handlers.append(self.user_id_ops)
|
||||
self.loki = LoggingRelationHandler(
|
||||
self,
|
||||
"logging",
|
||||
LOKI_RELATION_NAME,
|
||||
self.configure_charm,
|
||||
mandatory="logging" in self.mandatory_relations,
|
||||
)
|
||||
@ -322,7 +326,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
|
||||
|
||||
logger.info("Configuring the tempest environment")
|
||||
|
||||
schedule = validated_schedule(self.config["schedule"])
|
||||
schedule = self.get_schedule()
|
||||
if not schedule.valid:
|
||||
raise sunbeam_guard.BlockedExceptionError(
|
||||
f"invalid schedule config: {schedule.err}"
|
||||
@ -340,6 +344,16 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
|
||||
"tempest init failed, see logs for more info"
|
||||
)
|
||||
|
||||
# Ensure the alert rules are in sync with charm config.
|
||||
if self.is_schedule_ready():
|
||||
update_alert_rules_files(schedule)
|
||||
else:
|
||||
ensure_alert_rules_disabled()
|
||||
|
||||
if self.loki.ready:
|
||||
for relation in self.model.relations[LOKI_RELATION_NAME]:
|
||||
self.loki.interface._handle_alert_rules(relation)
|
||||
|
||||
self.status.set(ActiveStatus(""))
|
||||
logger.info("Finished configuring the tempest environment")
|
||||
|
||||
|
@ -37,6 +37,9 @@ import ops.model
|
||||
import ops.pebble
|
||||
import ops_sunbeam.container_handlers as sunbeam_chandlers
|
||||
import ops_sunbeam.relation_handlers as sunbeam_rhandlers
|
||||
from utils.alert_rules import (
|
||||
ALERT_RULES_PATH,
|
||||
)
|
||||
from utils.cleanup import (
|
||||
CleanUpError,
|
||||
run_extensive_cleanup,
|
||||
@ -669,7 +672,7 @@ class LoggingRelationHandler(sunbeam_rhandlers.RelationHandler):
|
||||
self.charm,
|
||||
recursive=True,
|
||||
relation_name=self.relation_name,
|
||||
alert_rules_path="src/loki_alert_rules",
|
||||
alert_rules_path=ALERT_RULES_PATH,
|
||||
logs_scheme={
|
||||
"tempest": {
|
||||
"log-files": [
|
||||
|
0
charms/tempest-k8s/src/loki_alert_rules/.gitkeep
Normal file
0
charms/tempest-k8s/src/loki_alert_rules/.gitkeep
Normal file
@ -1,10 +0,0 @@
|
||||
groups:
|
||||
- name: tempest-failed-tests
|
||||
rules:
|
||||
- alert: FailedTests
|
||||
expr: |
|
||||
sum_over_time({filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [5m]) > 0
|
||||
labels:
|
||||
severity: high
|
||||
annotations:
|
||||
summary: "Failed tests: {{ $value }}!"
|
89
charms/tempest-k8s/src/utils/alert_rules.py
Normal file
89
charms/tempest-k8s/src/utils/alert_rules.py
Normal file
@ -0,0 +1,89 @@
|
||||
# Copyright 2024 Canonical Ltd.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Working with the loki logging alert rules."""
|
||||
import os
|
||||
from math import (
|
||||
ceil,
|
||||
)
|
||||
|
||||
import yaml
|
||||
from utils.validators import (
|
||||
Schedule,
|
||||
)
|
||||
|
||||
ALERT_RULES_PATH = "src/loki_alert_rules"
|
||||
ALERT_RULES_FILE = ALERT_RULES_PATH + "/tests.rules"
|
||||
|
||||
# The default for max_query_length in Loki is now 721h,
|
||||
# and thus the value in Loki deployed by COS.
|
||||
# ref. https://github.com/grafana/loki/issues/4509
|
||||
# We need a small buffer to make it work in these queries.
|
||||
MAX_RANGE_HOURS = 719
|
||||
|
||||
|
||||
def ensure_alert_rules_disabled():
|
||||
"""Ensure the alert rules files don't exist."""
|
||||
try:
|
||||
os.remove(ALERT_RULES_FILE)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
return
|
||||
|
||||
|
||||
def update_alert_rules_files(schedule: Schedule) -> None:
|
||||
"""Update files for alert rules based on the schedule.
|
||||
|
||||
`schedule` is expected to be a valid and ready Schedule.
|
||||
"""
|
||||
absent_range_hours = min(
|
||||
# Convert seconds to hours,
|
||||
# round up to avoid a range of 0,
|
||||
# and double the interval to ensure it only alerts when one was definitely missed.
|
||||
ceil(schedule.max_interval / 60 / 60) * 2,
|
||||
# Ensure that the log query limit isn't exceeded
|
||||
MAX_RANGE_HOURS,
|
||||
)
|
||||
|
||||
rules = {
|
||||
"groups": [
|
||||
{
|
||||
"name": "tempest-failed-tests",
|
||||
"rules": [
|
||||
{
|
||||
"alert": "FailedTests",
|
||||
"expr": f'last_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [{MAX_RANGE_HOURS}h]) > 0',
|
||||
"labels": {
|
||||
"severity": "high",
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Tempest periodic tests failed.",
|
||||
},
|
||||
},
|
||||
{
|
||||
"alert": "AbsentTests",
|
||||
"expr": f'absent_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" [{absent_range_hours}h]) == 1',
|
||||
"labels": {
|
||||
"severity": "high",
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "Tempest periodic tests were not run on schedule.",
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
with open(ALERT_RULES_FILE, "w") as f:
|
||||
yaml.safe_dump(rules, f)
|
@ -18,25 +18,37 @@ from dataclasses import (
|
||||
from datetime import (
|
||||
datetime,
|
||||
)
|
||||
from functools import (
|
||||
lru_cache,
|
||||
)
|
||||
|
||||
from croniter import (
|
||||
CroniterBadDateError,
|
||||
croniter,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(frozen=True)
|
||||
class Schedule:
|
||||
"""A cron schedule that has validation information."""
|
||||
|
||||
value: str
|
||||
valid: bool
|
||||
err: str
|
||||
# in validation, these are the maximum and minimum intervals between runs seen
|
||||
max_interval: int = 0 # in seconds
|
||||
min_interval: int = 0 # in seconds
|
||||
|
||||
|
||||
@lru_cache
|
||||
def validated_schedule(schedule: str) -> Schedule:
|
||||
"""Process and validate a schedule str.
|
||||
|
||||
Return the schedule with validation info.
|
||||
|
||||
Part of validation includes sampling a range of matches
|
||||
for the cron schedule. This can be time consuming,
|
||||
so this function is cached to avoid repeating work.
|
||||
"""
|
||||
# Empty schedule is fine; it means it's disabled in this context.
|
||||
if not schedule:
|
||||
@ -66,18 +78,39 @@ def validated_schedule(schedule: str) -> Schedule:
|
||||
)
|
||||
return Schedule(value=schedule, valid=False, err=msg)
|
||||
|
||||
# This is a rather naive method for enforcing this,
|
||||
# This is a heuristic method of checking because cron schedules aren't regular,
|
||||
# and it may be possible to craft an expression
|
||||
# that results in some consecutive runs within 15 minutes,
|
||||
# however this is fine, as there is process locking for tempest,
|
||||
# and this is more of a sanity check than a security requirement.
|
||||
t1 = cron.get_next()
|
||||
t2 = cron.get_next()
|
||||
if t2 - t1 < 15 * 60: # 15 minutes in seconds
|
||||
intervals = [] # in seconds
|
||||
try:
|
||||
last = cron.get_next()
|
||||
for _ in range(5):
|
||||
next_ = cron.get_next()
|
||||
intervals.append(next_ - last)
|
||||
last = next_
|
||||
except CroniterBadDateError:
|
||||
return Schedule(
|
||||
value=schedule,
|
||||
valid=False,
|
||||
err=(
|
||||
"Could not calculate a range of values from the schedule; "
|
||||
"please check the schedule or try a shorter schedule period."
|
||||
),
|
||||
)
|
||||
|
||||
if min(intervals) < 15 * 60: # 15 minutes in seconds
|
||||
return Schedule(
|
||||
value=schedule,
|
||||
valid=False,
|
||||
err="Cannot schedule periodic check to run faster than every 15 minutes.",
|
||||
)
|
||||
|
||||
return Schedule(value=schedule, valid=True, err="")
|
||||
return Schedule(
|
||||
value=schedule,
|
||||
valid=True,
|
||||
err="",
|
||||
max_interval=max(intervals),
|
||||
min_interval=min(intervals),
|
||||
)
|
||||
|
@ -111,16 +111,6 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
|
||||
self.patch_obj(
|
||||
utils.cleanup, "_get_exclusion_resources"
|
||||
).return_value = {"projects": set(), "users": set()}
|
||||
# We must keep a reference to the patcher object,
|
||||
# because in a couple of tests we need to not patch this.
|
||||
# self.patch_obj doesn't give us a reference to the patcher.
|
||||
self.get_unit_data_patcher = patch.object(
|
||||
charm.TempestOperatorCharm,
|
||||
"get_unit_data",
|
||||
Mock(return_value="true"),
|
||||
)
|
||||
self.get_unit_data_patcher.start()
|
||||
self.addCleanup(self.get_unit_data_patcher.stop)
|
||||
|
||||
def add_identity_ops_relation(self, harness):
|
||||
"""Add identity resource relation."""
|
||||
@ -491,7 +481,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
|
||||
|
||||
self.harness.update_config({"schedule": "*/21 * * * *"})
|
||||
|
||||
self.harness.charm.set_tempest_ready.has_calls(
|
||||
self.harness.charm.set_tempest_ready.assert_has_calls(
|
||||
[call(False), call(False)]
|
||||
)
|
||||
self.assertEqual(self.harness.charm.set_tempest_ready.call_count, 2)
|
||||
@ -507,9 +497,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
|
||||
self.add_identity_ops_relation(self.harness)
|
||||
self.add_grafana_dashboard_relation(self.harness)
|
||||
|
||||
# We want the real get_unit_data method here,
|
||||
# because its logic is being tested.
|
||||
self.get_unit_data_patcher.stop()
|
||||
# simulate tempest ready
|
||||
self.harness.charm.peers = Mock()
|
||||
self.harness.charm.peers.interface.peers_rel.data = MagicMock()
|
||||
self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
|
||||
@ -525,9 +513,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
|
||||
self.add_identity_ops_relation(self.harness)
|
||||
self.add_grafana_dashboard_relation(self.harness)
|
||||
|
||||
# We want the real get_unit_data method here,
|
||||
# because its logic is being tested.
|
||||
self.get_unit_data_patcher.stop()
|
||||
# simulate tempest not ready
|
||||
self.harness.charm.peers = Mock()
|
||||
self.harness.charm.peers.interface.peers_rel.data = MagicMock()
|
||||
self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
|
||||
|
@ -17,6 +17,9 @@
|
||||
"""Unit tests for Tempest validator utility functions."""
|
||||
|
||||
import unittest
|
||||
from dataclasses import (
|
||||
FrozenInstanceError,
|
||||
)
|
||||
|
||||
from utils.validators import (
|
||||
validated_schedule,
|
||||
@ -85,3 +88,21 @@ class TempestCharmValidatorTests(unittest.TestCase):
|
||||
self.assertFalse(schedule.valid)
|
||||
self.assertIn("not acceptable", schedule.err)
|
||||
self.assertEqual(schedule.value, exp)
|
||||
|
||||
def test_expression_too_sparse(self):
|
||||
"""Verify an expression with a very long period is caught."""
|
||||
exp = "0 4 30 2 *" # on february 30 ;)
|
||||
schedule = validated_schedule(exp)
|
||||
self.assertFalse(schedule.valid)
|
||||
self.assertIn("not calculate a range", schedule.err)
|
||||
self.assertEqual(schedule.value, exp)
|
||||
|
||||
def test_schedule_type_is_immutable(self):
|
||||
"""Schedule should be immutable."""
|
||||
# this is both to avoid issues with caching it,
|
||||
# and to ensure a validated schedule is not accidentally modified
|
||||
# (it should not be modified because then it may not be valid any more)
|
||||
schedule = validated_schedule("5 4 * * *")
|
||||
self.assertTrue(schedule.valid)
|
||||
with self.assertRaises(FrozenInstanceError):
|
||||
schedule.valid = False
|
||||
|
Loading…
x
Reference in New Issue
Block a user