Improve tempest alert rules

Alert if results not seen for some time
(if an expected test run was missed).

Keep the failed test alert firing until a test run succeeds
(this alert would resolve itself after 5min, which was undesirable).

Change-Id: I4280cd181e709cef3bba6a114273cb32c12a9f7d
This commit is contained in:
Samuel Allan 2024-06-11 17:14:33 +09:30
parent 59afdcc479
commit 39cb6917bb
No known key found for this signature in database
GPG Key ID: 622F8E99C893BD61
9 changed files with 196 additions and 58 deletions

2
charms/tempest-k8s/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# alert rules for this charm are dynamically written to disk
/src/loki_alert_rules/*.rules

View File

@ -51,6 +51,10 @@ from ops.model import (
from ops_sunbeam.config_contexts import (
ConfigContext,
)
from utils.alert_rules import (
ensure_alert_rules_disabled,
update_alert_rules_files,
)
from utils.cleanup import (
CleanUpError,
run_extensive_cleanup,
@ -73,9 +77,12 @@ from utils.types import (
TempestEnvVariant,
)
from utils.validators import (
Schedule,
validated_schedule,
)
LOKI_RELATION_NAME = "logging"
logger = logging.getLogger(__name__)
@ -85,7 +92,11 @@ class TempestConfigurationContext(ConfigContext):
def context(self) -> dict:
"""Tempest context."""
return {
"schedule": self.charm.get_schedule(),
"schedule": (
self.charm.get_schedule().value
if self.charm.is_schedule_ready()
else ""
),
}
@ -131,31 +142,24 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
),
]
def get_schedule(self) -> str:
"""Return the schedule option if valid and should be enabled.
def get_schedule(self) -> Schedule:
"""Validate and return the schedule from config."""
return validated_schedule(self.config["schedule"])
If the schedule option is invalid,
or periodic checks shouldn't currently be enabled
(eg. observability relations not ready),
then return an empty schedule string.
An empty string disables the schedule.
def is_schedule_ready(self) -> bool:
"""Check if the schedule is valid and periodic tests should be enabled.
Return True if the schedule config option is valid,
and pre-requisites for periodic checks are ready.
"""
schedule = validated_schedule(self.config["schedule"])
if not schedule.valid:
return ""
# if tempest env isn't ready,
# or if the logging relation isn't joined,
# or if keystone isn't ready,
# then we can't start scheduling periodic tests
if not (
self.is_tempest_ready()
schedule = self.get_schedule()
return (
schedule.valid
and schedule.value
and self.is_tempest_ready()
and self.loki.ready
and self.user_id_ops.ready
):
return ""
return schedule.value
)
@property
def config_contexts(self) -> List[ConfigContext]:
@ -188,7 +192,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
handlers.append(self.user_id_ops)
self.loki = LoggingRelationHandler(
self,
"logging",
LOKI_RELATION_NAME,
self.configure_charm,
mandatory="logging" in self.mandatory_relations,
)
@ -322,7 +326,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
logger.info("Configuring the tempest environment")
schedule = validated_schedule(self.config["schedule"])
schedule = self.get_schedule()
if not schedule.valid:
raise sunbeam_guard.BlockedExceptionError(
f"invalid schedule config: {schedule.err}"
@ -340,6 +344,16 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
"tempest init failed, see logs for more info"
)
# Ensure the alert rules are in sync with charm config.
if self.is_schedule_ready():
update_alert_rules_files(schedule)
else:
ensure_alert_rules_disabled()
if self.loki.ready:
for relation in self.model.relations[LOKI_RELATION_NAME]:
self.loki.interface._handle_alert_rules(relation)
self.status.set(ActiveStatus(""))
logger.info("Finished configuring the tempest environment")

View File

@ -37,6 +37,9 @@ import ops.model
import ops.pebble
import ops_sunbeam.container_handlers as sunbeam_chandlers
import ops_sunbeam.relation_handlers as sunbeam_rhandlers
from utils.alert_rules import (
ALERT_RULES_PATH,
)
from utils.cleanup import (
CleanUpError,
run_extensive_cleanup,
@ -669,7 +672,7 @@ class LoggingRelationHandler(sunbeam_rhandlers.RelationHandler):
self.charm,
recursive=True,
relation_name=self.relation_name,
alert_rules_path="src/loki_alert_rules",
alert_rules_path=ALERT_RULES_PATH,
logs_scheme={
"tempest": {
"log-files": [

View File

@ -1,10 +0,0 @@
groups:
- name: tempest-failed-tests
rules:
- alert: FailedTests
expr: |
sum_over_time({filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [5m]) > 0
labels:
severity: high
annotations:
summary: "Failed tests: {{ $value }}!"

View File

@ -0,0 +1,89 @@
# Copyright 2024 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Working with the loki logging alert rules."""
import os
from math import (
ceil,
)
import yaml
from utils.validators import (
Schedule,
)
ALERT_RULES_PATH = "src/loki_alert_rules"
ALERT_RULES_FILE = ALERT_RULES_PATH + "/tests.rules"
# The default for max_query_length in Loki is now 721h,
# and thus the value in Loki deployed by COS.
# ref. https://github.com/grafana/loki/issues/4509
# We need a small buffer to make it work in these queries.
MAX_RANGE_HOURS = 719
def ensure_alert_rules_disabled():
"""Ensure the alert rules files don't exist."""
try:
os.remove(ALERT_RULES_FILE)
except FileNotFoundError:
pass
return
def update_alert_rules_files(schedule: Schedule) -> None:
"""Update files for alert rules based on the schedule.
`schedule` is expected to be a valid and ready Schedule.
"""
absent_range_hours = min(
# Convert seconds to hours,
# round up to avoid a range of 0,
# and double the interval to ensure it only alerts when one was definitely missed.
ceil(schedule.max_interval / 60 / 60) * 2,
# Ensure that the log query limit isn't exceeded
MAX_RANGE_HOURS,
)
rules = {
"groups": [
{
"name": "tempest-failed-tests",
"rules": [
{
"alert": "FailedTests",
"expr": f'last_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [{MAX_RANGE_HOURS}h]) > 0',
"labels": {
"severity": "high",
},
"annotations": {
"summary": "Tempest periodic tests failed.",
},
},
{
"alert": "AbsentTests",
"expr": f'absent_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" [{absent_range_hours}h]) == 1',
"labels": {
"severity": "high",
},
"annotations": {
"summary": "Tempest periodic tests were not run on schedule.",
},
},
],
}
]
}
with open(ALERT_RULES_FILE, "w") as f:
yaml.safe_dump(rules, f)

View File

@ -18,25 +18,37 @@ from dataclasses import (
from datetime import (
datetime,
)
from functools import (
lru_cache,
)
from croniter import (
CroniterBadDateError,
croniter,
)
@dataclass
@dataclass(frozen=True)
class Schedule:
"""A cron schedule that has validation information."""
value: str
valid: bool
err: str
# in validation, these are the maximum and minimum intervals between runs seen
max_interval: int = 0 # in seconds
min_interval: int = 0 # in seconds
@lru_cache
def validated_schedule(schedule: str) -> Schedule:
"""Process and validate a schedule str.
Return the schedule with validation info.
Part of validation includes sampling a range of matches
for the cron schedule. This can be time consuming,
so this function is cached to avoid repeating work.
"""
# Empty schedule is fine; it means it's disabled in this context.
if not schedule:
@ -66,18 +78,39 @@ def validated_schedule(schedule: str) -> Schedule:
)
return Schedule(value=schedule, valid=False, err=msg)
# This is a rather naive method for enforcing this,
# This is a heuristic method of checking because cron schedules aren't regular,
# and it may be possible to craft an expression
# that results in some consecutive runs within 15 minutes,
# however this is fine, as there is process locking for tempest,
# and this is more of a sanity check than a security requirement.
t1 = cron.get_next()
t2 = cron.get_next()
if t2 - t1 < 15 * 60: # 15 minutes in seconds
intervals = [] # in seconds
try:
last = cron.get_next()
for _ in range(5):
next_ = cron.get_next()
intervals.append(next_ - last)
last = next_
except CroniterBadDateError:
return Schedule(
value=schedule,
valid=False,
err=(
"Could not calculate a range of values from the schedule; "
"please check the schedule or try a shorter schedule period."
),
)
if min(intervals) < 15 * 60: # 15 minutes in seconds
return Schedule(
value=schedule,
valid=False,
err="Cannot schedule periodic check to run faster than every 15 minutes.",
)
return Schedule(value=schedule, valid=True, err="")
return Schedule(
value=schedule,
valid=True,
err="",
max_interval=max(intervals),
min_interval=min(intervals),
)

View File

@ -111,16 +111,6 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
self.patch_obj(
utils.cleanup, "_get_exclusion_resources"
).return_value = {"projects": set(), "users": set()}
# We must keep a reference to the patcher object,
# because in a couple of tests we need to not patch this.
# self.patch_obj doesn't give us a reference to the patcher.
self.get_unit_data_patcher = patch.object(
charm.TempestOperatorCharm,
"get_unit_data",
Mock(return_value="true"),
)
self.get_unit_data_patcher.start()
self.addCleanup(self.get_unit_data_patcher.stop)
def add_identity_ops_relation(self, harness):
"""Add identity resource relation."""
@ -491,7 +481,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
self.harness.update_config({"schedule": "*/21 * * * *"})
self.harness.charm.set_tempest_ready.has_calls(
self.harness.charm.set_tempest_ready.assert_has_calls(
[call(False), call(False)]
)
self.assertEqual(self.harness.charm.set_tempest_ready.call_count, 2)
@ -507,9 +497,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
self.add_identity_ops_relation(self.harness)
self.add_grafana_dashboard_relation(self.harness)
# We want the real get_unit_data method here,
# because its logic is being tested.
self.get_unit_data_patcher.stop()
# simulate tempest ready
self.harness.charm.peers = Mock()
self.harness.charm.peers.interface.peers_rel.data = MagicMock()
self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
@ -525,9 +513,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
self.add_identity_ops_relation(self.harness)
self.add_grafana_dashboard_relation(self.harness)
# We want the real get_unit_data method here,
# because its logic is being tested.
self.get_unit_data_patcher.stop()
# simulate tempest not ready
self.harness.charm.peers = Mock()
self.harness.charm.peers.interface.peers_rel.data = MagicMock()
self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {

View File

@ -17,6 +17,9 @@
"""Unit tests for Tempest validator utility functions."""
import unittest
from dataclasses import (
FrozenInstanceError,
)
from utils.validators import (
validated_schedule,
@ -85,3 +88,21 @@ class TempestCharmValidatorTests(unittest.TestCase):
self.assertFalse(schedule.valid)
self.assertIn("not acceptable", schedule.err)
self.assertEqual(schedule.value, exp)
def test_expression_too_sparse(self):
"""Verify an expression with a very long period is caught."""
exp = "0 4 30 2 *" # on february 30 ;)
schedule = validated_schedule(exp)
self.assertFalse(schedule.valid)
self.assertIn("not calculate a range", schedule.err)
self.assertEqual(schedule.value, exp)
def test_schedule_type_is_immutable(self):
"""Schedule should be immutable."""
# this is both to avoid issues with caching it,
# and to ensure a validated schedule is not accidentally modified
# (it should not be modified because then it may not be valid any more)
schedule = validated_schedule("5 4 * * *")
self.assertTrue(schedule.valid)
with self.assertRaises(FrozenInstanceError):
schedule.valid = False