Improve tempest alert rules

Alert if results not seen for some time (if an expected test run was missed). Keep the failed test alert firing until a test run succeeds (this alert would resolve itself after 5min, which was undesirable). Change-Id: I4280cd181e709cef3bba6a114273cb32c12a9f7d
2024-06-11 17:14:33 +09:30 · 2024-06-11 17:14:33 +09:30 · 39cb6917bb
commit 39cb6917bb
parent 59afdcc479
9 changed files with 196 additions and 58 deletions
--- a/charms/tempest-k8s/.gitignore
+++ b/charms/tempest-k8s/.gitignore
@ -0,0 +1,2 @@
+# alert rules for this charm are dynamically written to disk
+/src/loki_alert_rules/*.rules
--- a/charms/tempest-k8s/src/charm.py
+++ b/charms/tempest-k8s/src/charm.py
@ -51,6 +51,10 @@ from ops.model import (
 from ops_sunbeam.config_contexts import (
    ConfigContext,
 )
+from utils.alert_rules import (
+    ensure_alert_rules_disabled,
+    update_alert_rules_files,
+)
 from utils.cleanup import (
    CleanUpError,
    run_extensive_cleanup,
@ -73,9 +77,12 @@ from utils.types import (
    TempestEnvVariant,
 )
 from utils.validators import (
+    Schedule,
    validated_schedule,
 )

+LOKI_RELATION_NAME = "logging"
+
 logger = logging.getLogger(__name__)


@ -85,7 +92,11 @@ class TempestConfigurationContext(ConfigContext):
    def context(self) -> dict:
        """Tempest context."""
        return {
-            "schedule": self.charm.get_schedule(),
+            "schedule": (
+                self.charm.get_schedule().value
+                if self.charm.is_schedule_ready()
+                else ""
+            ),
        }


@ -131,31 +142,24 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
            ),
        ]

-    def get_schedule(self) -> str:
-        """Return the schedule option if valid and should be enabled.
+    def get_schedule(self) -> Schedule:
+        """Validate and return the schedule from config."""
+        return validated_schedule(self.config["schedule"])

-        If the schedule option is invalid,
-        or periodic checks shouldn't currently be enabled
-        (eg. observability relations not ready),
-        then return an empty schedule string.
-        An empty string disables the schedule.
+    def is_schedule_ready(self) -> bool:
+        """Check if the schedule is valid and periodic tests should be enabled.
+
+        Return True if the schedule config option is valid,
+        and pre-requisites for periodic checks are ready.
        """
-        schedule = validated_schedule(self.config["schedule"])
-        if not schedule.valid:
-            return ""
-
-        # if tempest env isn't ready,
-        # or if the logging relation isn't joined,
-        # or if keystone isn't ready,
-        # then we can't start scheduling periodic tests
-        if not (
-            self.is_tempest_ready()
+        schedule = self.get_schedule()
+        return (
+            schedule.valid
+            and schedule.value
+            and self.is_tempest_ready()
            and self.loki.ready
            and self.user_id_ops.ready
-        ):
-            return ""
-
-        return schedule.value
+        )

    @property
    def config_contexts(self) -> List[ConfigContext]:
@ -188,7 +192,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
        handlers.append(self.user_id_ops)
        self.loki = LoggingRelationHandler(
            self,
-            "logging",
+            LOKI_RELATION_NAME,
            self.configure_charm,
            mandatory="logging" in self.mandatory_relations,
        )
@ -322,7 +326,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):

        logger.info("Configuring the tempest environment")

-        schedule = validated_schedule(self.config["schedule"])
+        schedule = self.get_schedule()
        if not schedule.valid:
            raise sunbeam_guard.BlockedExceptionError(
                f"invalid schedule config: {schedule.err}"
@ -340,6 +344,16 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
                "tempest init failed, see logs for more info"
            )

+        # Ensure the alert rules are in sync with charm config.
+        if self.is_schedule_ready():
+            update_alert_rules_files(schedule)
+        else:
+            ensure_alert_rules_disabled()
+
+        if self.loki.ready:
+            for relation in self.model.relations[LOKI_RELATION_NAME]:
+                self.loki.interface._handle_alert_rules(relation)
+
        self.status.set(ActiveStatus(""))
        logger.info("Finished configuring the tempest environment")

--- a/charms/tempest-k8s/src/handlers.py
+++ b/charms/tempest-k8s/src/handlers.py
@ -37,6 +37,9 @@ import ops.model
 import ops.pebble
 import ops_sunbeam.container_handlers as sunbeam_chandlers
 import ops_sunbeam.relation_handlers as sunbeam_rhandlers
+from utils.alert_rules import (
+    ALERT_RULES_PATH,
+)
 from utils.cleanup import (
    CleanUpError,
    run_extensive_cleanup,
@ -669,7 +672,7 @@ class LoggingRelationHandler(sunbeam_rhandlers.RelationHandler):
            self.charm,
            recursive=True,
            relation_name=self.relation_name,
-            alert_rules_path="src/loki_alert_rules",
+            alert_rules_path=ALERT_RULES_PATH,
            logs_scheme={
                "tempest": {
                    "log-files": [
--- a/charms/tempest-k8s/src/loki_alert_rules/.gitkeep
+++ b/charms/tempest-k8s/src/loki_alert_rules/.gitkeep
--- a/charms/tempest-k8s/src/loki_alert_rules/tests.rules
+++ b/charms/tempest-k8s/src/loki_alert_rules/tests.rules
@ -1,10 +0,0 @@
-groups:
- name: tempest-failed-tests
-  rules:
-  - alert: FailedTests
-    expr: |
-      sum_over_time({filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [5m]) > 0
-    labels:
-      severity: high
-    annotations:
-      summary: "Failed tests: {{ $value }}!"
--- a/charms/tempest-k8s/src/utils/alert_rules.py
+++ b/charms/tempest-k8s/src/utils/alert_rules.py
@ -0,0 +1,89 @@
+# Copyright 2024 Canonical Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Working with the loki logging alert rules."""
+import os
+from math import (
+    ceil,
+)
+
+import yaml
+from utils.validators import (
+    Schedule,
+)
+
+ALERT_RULES_PATH = "src/loki_alert_rules"
+ALERT_RULES_FILE = ALERT_RULES_PATH + "/tests.rules"
+
+# The default for max_query_length in Loki is now 721h,
+# and thus the value in Loki deployed by COS.
+# ref. https://github.com/grafana/loki/issues/4509
+# We need a small buffer to make it work in these queries.
+MAX_RANGE_HOURS = 719
+
+
+def ensure_alert_rules_disabled():
+    """Ensure the alert rules files don't exist."""
+    try:
+        os.remove(ALERT_RULES_FILE)
+    except FileNotFoundError:
+        pass
+    return
+
+
+def update_alert_rules_files(schedule: Schedule) -> None:
+    """Update files for alert rules based on the schedule.
+
+    `schedule` is expected to be a valid and ready Schedule.
+    """
+    absent_range_hours = min(
+        # Convert seconds to hours,
+        # round up to avoid a range of 0,
+        # and double the interval to ensure it only alerts when one was definitely missed.
+        ceil(schedule.max_interval / 60 / 60) * 2,
+        # Ensure that the log query limit isn't exceeded
+        MAX_RANGE_HOURS,
+    )
+
+    rules = {
+        "groups": [
+            {
+                "name": "tempest-failed-tests",
+                "rules": [
+                    {
+                        "alert": "FailedTests",
+                        "expr": f'last_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [{MAX_RANGE_HOURS}h]) > 0',
+                        "labels": {
+                            "severity": "high",
+                        },
+                        "annotations": {
+                            "summary": "Tempest periodic tests failed.",
+                        },
+                    },
+                    {
+                        "alert": "AbsentTests",
+                        "expr": f'absent_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" [{absent_range_hours}h]) == 1',
+                        "labels": {
+                            "severity": "high",
+                        },
+                        "annotations": {
+                            "summary": "Tempest periodic tests were not run on schedule.",
+                        },
+                    },
+                ],
+            }
+        ]
+    }
+
+    with open(ALERT_RULES_FILE, "w") as f:
+        yaml.safe_dump(rules, f)
--- a/charms/tempest-k8s/src/utils/validators.py
+++ b/charms/tempest-k8s/src/utils/validators.py
@ -18,25 +18,37 @@ from dataclasses import (
 from datetime import (
    datetime,
 )
+from functools import (
+    lru_cache,
+)

 from croniter import (
+    CroniterBadDateError,
    croniter,
 )


-@dataclass
+@dataclass(frozen=True)
 class Schedule:
    """A cron schedule that has validation information."""

    value: str
    valid: bool
    err: str
+    # in validation, these are the maximum and minimum intervals between runs seen
+    max_interval: int = 0  # in seconds
+    min_interval: int = 0  # in seconds


+@lru_cache
 def validated_schedule(schedule: str) -> Schedule:
    """Process and validate a schedule str.

    Return the schedule with validation info.
+
+    Part of validation includes sampling a range of matches
+    for the cron schedule.  This can be time consuming,
+    so this function is cached to avoid repeating work.
    """
    # Empty schedule is fine; it means it's disabled in this context.
    if not schedule:
@ -66,18 +78,39 @@ def validated_schedule(schedule: str) -> Schedule:
            )
        return Schedule(value=schedule, valid=False, err=msg)

-    # This is a rather naive method for enforcing this,
+    # This is a heuristic method of checking because cron schedules aren't regular,
    # and it may be possible to craft an expression
    # that results in some consecutive runs within 15 minutes,
    # however this is fine, as there is process locking for tempest,
    # and this is more of a sanity check than a security requirement.
-    t1 = cron.get_next()
-    t2 = cron.get_next()
-    if t2 - t1 < 15 * 60:  # 15 minutes in seconds
+    intervals = []  # in seconds
+    try:
+        last = cron.get_next()
+        for _ in range(5):
+            next_ = cron.get_next()
+            intervals.append(next_ - last)
+            last = next_
+    except CroniterBadDateError:
+        return Schedule(
+            value=schedule,
+            valid=False,
+            err=(
+                "Could not calculate a range of values from the schedule; "
+                "please check the schedule or try a shorter schedule period."
+            ),
+        )
+
+    if min(intervals) < 15 * 60:  # 15 minutes in seconds
        return Schedule(
            value=schedule,
            valid=False,
            err="Cannot schedule periodic check to run faster than every 15 minutes.",
        )

-    return Schedule(value=schedule, valid=True, err="")
+    return Schedule(
+        value=schedule,
+        valid=True,
+        err="",
+        max_interval=max(intervals),
+        min_interval=min(intervals),
+    )
--- a/charms/tempest-k8s/tests/unit/test_tempest_charm.py
+++ b/charms/tempest-k8s/tests/unit/test_tempest_charm.py
@ -111,16 +111,6 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
        self.patch_obj(
            utils.cleanup, "_get_exclusion_resources"
        ).return_value = {"projects": set(), "users": set()}
-        # We must keep a reference to the patcher object,
-        # because in a couple of tests we need to not patch this.
-        # self.patch_obj doesn't give us a reference to the patcher.
-        self.get_unit_data_patcher = patch.object(
-            charm.TempestOperatorCharm,
-            "get_unit_data",
-            Mock(return_value="true"),
-        )
-        self.get_unit_data_patcher.start()
-        self.addCleanup(self.get_unit_data_patcher.stop)

    def add_identity_ops_relation(self, harness):
        """Add identity resource relation."""
@ -491,7 +481,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):

        self.harness.update_config({"schedule": "*/21 * * * *"})

-        self.harness.charm.set_tempest_ready.has_calls(
+        self.harness.charm.set_tempest_ready.assert_has_calls(
            [call(False), call(False)]
        )
        self.assertEqual(self.harness.charm.set_tempest_ready.call_count, 2)
@ -507,9 +497,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
        self.add_identity_ops_relation(self.harness)
        self.add_grafana_dashboard_relation(self.harness)

-        # We want the real get_unit_data method here,
-        # because its logic is being tested.
-        self.get_unit_data_patcher.stop()
+        # simulate tempest ready
        self.harness.charm.peers = Mock()
        self.harness.charm.peers.interface.peers_rel.data = MagicMock()
        self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
@ -525,9 +513,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
        self.add_identity_ops_relation(self.harness)
        self.add_grafana_dashboard_relation(self.harness)

-        # We want the real get_unit_data method here,
-        # because its logic is being tested.
-        self.get_unit_data_patcher.stop()
+        # simulate tempest not ready
        self.harness.charm.peers = Mock()
        self.harness.charm.peers.interface.peers_rel.data = MagicMock()
        self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
--- a/charms/tempest-k8s/tests/unit/test_validators.py
+++ b/charms/tempest-k8s/tests/unit/test_validators.py
@ -17,6 +17,9 @@
 """Unit tests for Tempest validator utility functions."""

 import unittest
+from dataclasses import (
+    FrozenInstanceError,
+)

 from utils.validators import (
    validated_schedule,
@ -85,3 +88,21 @@ class TempestCharmValidatorTests(unittest.TestCase):
        self.assertFalse(schedule.valid)
        self.assertIn("not acceptable", schedule.err)
        self.assertEqual(schedule.value, exp)
+
+    def test_expression_too_sparse(self):
+        """Verify an expression with a very long period is caught."""
+        exp = "0 4 30 2 *"  # on february 30  ;)
+        schedule = validated_schedule(exp)
+        self.assertFalse(schedule.valid)
+        self.assertIn("not calculate a range", schedule.err)
+        self.assertEqual(schedule.value, exp)
+
+    def test_schedule_type_is_immutable(self):
+        """Schedule should be immutable."""
+        # this is both to avoid issues with caching it,
+        # and to ensure a validated schedule is not accidentally modified
+        # (it should not be modified because then it may not be valid any more)
+        schedule = validated_schedule("5 4 * * *")
+        self.assertTrue(schedule.valid)
+        with self.assertRaises(FrozenInstanceError):
+            schedule.valid = False