diff --git a/charms/tempest-k8s/.gitignore b/charms/tempest-k8s/.gitignore new file mode 100644 index 00000000..e7991265 --- /dev/null +++ b/charms/tempest-k8s/.gitignore @@ -0,0 +1,2 @@ +# alert rules for this charm are dynamically written to disk +/src/loki_alert_rules/*.rules diff --git a/charms/tempest-k8s/src/charm.py b/charms/tempest-k8s/src/charm.py index 39118cf9..70d79263 100755 --- a/charms/tempest-k8s/src/charm.py +++ b/charms/tempest-k8s/src/charm.py @@ -51,6 +51,10 @@ from ops.model import ( from ops_sunbeam.config_contexts import ( ConfigContext, ) +from utils.alert_rules import ( + ensure_alert_rules_disabled, + update_alert_rules_files, +) from utils.cleanup import ( CleanUpError, run_extensive_cleanup, @@ -73,9 +77,12 @@ from utils.types import ( TempestEnvVariant, ) from utils.validators import ( + Schedule, validated_schedule, ) +LOKI_RELATION_NAME = "logging" + logger = logging.getLogger(__name__) @@ -85,7 +92,11 @@ class TempestConfigurationContext(ConfigContext): def context(self) -> dict: """Tempest context.""" return { - "schedule": self.charm.get_schedule(), + "schedule": ( + self.charm.get_schedule().value + if self.charm.is_schedule_ready() + else "" + ), } @@ -131,31 +142,24 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S): ), ] - def get_schedule(self) -> str: - """Return the schedule option if valid and should be enabled. + def get_schedule(self) -> Schedule: + """Validate and return the schedule from config.""" + return validated_schedule(self.config["schedule"]) - If the schedule option is invalid, - or periodic checks shouldn't currently be enabled - (eg. observability relations not ready), - then return an empty schedule string. - An empty string disables the schedule. + def is_schedule_ready(self) -> bool: + """Check if the schedule is valid and periodic tests should be enabled. + + Return True if the schedule config option is valid, + and pre-requisites for periodic checks are ready. """ - schedule = validated_schedule(self.config["schedule"]) - if not schedule.valid: - return "" - - # if tempest env isn't ready, - # or if the logging relation isn't joined, - # or if keystone isn't ready, - # then we can't start scheduling periodic tests - if not ( - self.is_tempest_ready() + schedule = self.get_schedule() + return ( + schedule.valid + and schedule.value + and self.is_tempest_ready() and self.loki.ready and self.user_id_ops.ready - ): - return "" - - return schedule.value + ) @property def config_contexts(self) -> List[ConfigContext]: @@ -188,7 +192,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S): handlers.append(self.user_id_ops) self.loki = LoggingRelationHandler( self, - "logging", + LOKI_RELATION_NAME, self.configure_charm, mandatory="logging" in self.mandatory_relations, ) @@ -322,7 +326,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S): logger.info("Configuring the tempest environment") - schedule = validated_schedule(self.config["schedule"]) + schedule = self.get_schedule() if not schedule.valid: raise sunbeam_guard.BlockedExceptionError( f"invalid schedule config: {schedule.err}" @@ -340,6 +344,16 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S): "tempest init failed, see logs for more info" ) + # Ensure the alert rules are in sync with charm config. + if self.is_schedule_ready(): + update_alert_rules_files(schedule) + else: + ensure_alert_rules_disabled() + + if self.loki.ready: + for relation in self.model.relations[LOKI_RELATION_NAME]: + self.loki.interface._handle_alert_rules(relation) + self.status.set(ActiveStatus("")) logger.info("Finished configuring the tempest environment") diff --git a/charms/tempest-k8s/src/handlers.py b/charms/tempest-k8s/src/handlers.py index 9aad01ba..56625c5f 100644 --- a/charms/tempest-k8s/src/handlers.py +++ b/charms/tempest-k8s/src/handlers.py @@ -37,6 +37,9 @@ import ops.model import ops.pebble import ops_sunbeam.container_handlers as sunbeam_chandlers import ops_sunbeam.relation_handlers as sunbeam_rhandlers +from utils.alert_rules import ( + ALERT_RULES_PATH, +) from utils.cleanup import ( CleanUpError, run_extensive_cleanup, @@ -669,7 +672,7 @@ class LoggingRelationHandler(sunbeam_rhandlers.RelationHandler): self.charm, recursive=True, relation_name=self.relation_name, - alert_rules_path="src/loki_alert_rules", + alert_rules_path=ALERT_RULES_PATH, logs_scheme={ "tempest": { "log-files": [ diff --git a/charms/tempest-k8s/src/loki_alert_rules/.gitkeep b/charms/tempest-k8s/src/loki_alert_rules/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/charms/tempest-k8s/src/loki_alert_rules/tests.rules b/charms/tempest-k8s/src/loki_alert_rules/tests.rules deleted file mode 100644 index 624d9f9b..00000000 --- a/charms/tempest-k8s/src/loki_alert_rules/tests.rules +++ /dev/null @@ -1,10 +0,0 @@ -groups: -- name: tempest-failed-tests - rules: - - alert: FailedTests - expr: | - sum_over_time({filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%} |~ "- Failed:" | pattern " - <_>: " | unwrap number_of_tests [5m]) > 0 - labels: - severity: high - annotations: - summary: "Failed tests: {{ $value }}!" diff --git a/charms/tempest-k8s/src/utils/alert_rules.py b/charms/tempest-k8s/src/utils/alert_rules.py new file mode 100644 index 00000000..315b373f --- /dev/null +++ b/charms/tempest-k8s/src/utils/alert_rules.py @@ -0,0 +1,89 @@ +# Copyright 2024 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Working with the loki logging alert rules.""" +import os +from math import ( + ceil, +) + +import yaml +from utils.validators import ( + Schedule, +) + +ALERT_RULES_PATH = "src/loki_alert_rules" +ALERT_RULES_FILE = ALERT_RULES_PATH + "/tests.rules" + +# The default for max_query_length in Loki is now 721h, +# and thus the value in Loki deployed by COS. +# ref. https://github.com/grafana/loki/issues/4509 +# We need a small buffer to make it work in these queries. +MAX_RANGE_HOURS = 719 + + +def ensure_alert_rules_disabled(): + """Ensure the alert rules files don't exist.""" + try: + os.remove(ALERT_RULES_FILE) + except FileNotFoundError: + pass + return + + +def update_alert_rules_files(schedule: Schedule) -> None: + """Update files for alert rules based on the schedule. + + `schedule` is expected to be a valid and ready Schedule. + """ + absent_range_hours = min( + # Convert seconds to hours, + # round up to avoid a range of 0, + # and double the interval to ensure it only alerts when one was definitely missed. + ceil(schedule.max_interval / 60 / 60) * 2, + # Ensure that the log query limit isn't exceeded + MAX_RANGE_HOURS, + ) + + rules = { + "groups": [ + { + "name": "tempest-failed-tests", + "rules": [ + { + "alert": "FailedTests", + "expr": f'last_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" | pattern " - <_>: " | unwrap number_of_tests [{MAX_RANGE_HOURS}h]) > 0', + "labels": { + "severity": "high", + }, + "annotations": { + "summary": "Tempest periodic tests failed.", + }, + }, + { + "alert": "AbsentTests", + "expr": f'absent_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" [{absent_range_hours}h]) == 1', + "labels": { + "severity": "high", + }, + "annotations": { + "summary": "Tempest periodic tests were not run on schedule.", + }, + }, + ], + } + ] + } + + with open(ALERT_RULES_FILE, "w") as f: + yaml.safe_dump(rules, f) diff --git a/charms/tempest-k8s/src/utils/validators.py b/charms/tempest-k8s/src/utils/validators.py index 6106e8ea..ec198c26 100644 --- a/charms/tempest-k8s/src/utils/validators.py +++ b/charms/tempest-k8s/src/utils/validators.py @@ -18,25 +18,37 @@ from dataclasses import ( from datetime import ( datetime, ) +from functools import ( + lru_cache, +) from croniter import ( + CroniterBadDateError, croniter, ) -@dataclass +@dataclass(frozen=True) class Schedule: """A cron schedule that has validation information.""" value: str valid: bool err: str + # in validation, these are the maximum and minimum intervals between runs seen + max_interval: int = 0 # in seconds + min_interval: int = 0 # in seconds +@lru_cache def validated_schedule(schedule: str) -> Schedule: """Process and validate a schedule str. Return the schedule with validation info. + + Part of validation includes sampling a range of matches + for the cron schedule. This can be time consuming, + so this function is cached to avoid repeating work. """ # Empty schedule is fine; it means it's disabled in this context. if not schedule: @@ -66,18 +78,39 @@ def validated_schedule(schedule: str) -> Schedule: ) return Schedule(value=schedule, valid=False, err=msg) - # This is a rather naive method for enforcing this, + # This is a heuristic method of checking because cron schedules aren't regular, # and it may be possible to craft an expression # that results in some consecutive runs within 15 minutes, # however this is fine, as there is process locking for tempest, # and this is more of a sanity check than a security requirement. - t1 = cron.get_next() - t2 = cron.get_next() - if t2 - t1 < 15 * 60: # 15 minutes in seconds + intervals = [] # in seconds + try: + last = cron.get_next() + for _ in range(5): + next_ = cron.get_next() + intervals.append(next_ - last) + last = next_ + except CroniterBadDateError: + return Schedule( + value=schedule, + valid=False, + err=( + "Could not calculate a range of values from the schedule; " + "please check the schedule or try a shorter schedule period." + ), + ) + + if min(intervals) < 15 * 60: # 15 minutes in seconds return Schedule( value=schedule, valid=False, err="Cannot schedule periodic check to run faster than every 15 minutes.", ) - return Schedule(value=schedule, valid=True, err="") + return Schedule( + value=schedule, + valid=True, + err="", + max_interval=max(intervals), + min_interval=min(intervals), + ) diff --git a/charms/tempest-k8s/tests/unit/test_tempest_charm.py b/charms/tempest-k8s/tests/unit/test_tempest_charm.py index d79633c0..ffa389cd 100644 --- a/charms/tempest-k8s/tests/unit/test_tempest_charm.py +++ b/charms/tempest-k8s/tests/unit/test_tempest_charm.py @@ -111,16 +111,6 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase): self.patch_obj( utils.cleanup, "_get_exclusion_resources" ).return_value = {"projects": set(), "users": set()} - # We must keep a reference to the patcher object, - # because in a couple of tests we need to not patch this. - # self.patch_obj doesn't give us a reference to the patcher. - self.get_unit_data_patcher = patch.object( - charm.TempestOperatorCharm, - "get_unit_data", - Mock(return_value="true"), - ) - self.get_unit_data_patcher.start() - self.addCleanup(self.get_unit_data_patcher.stop) def add_identity_ops_relation(self, harness): """Add identity resource relation.""" @@ -491,7 +481,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase): self.harness.update_config({"schedule": "*/21 * * * *"}) - self.harness.charm.set_tempest_ready.has_calls( + self.harness.charm.set_tempest_ready.assert_has_calls( [call(False), call(False)] ) self.assertEqual(self.harness.charm.set_tempest_ready.call_count, 2) @@ -507,9 +497,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase): self.add_identity_ops_relation(self.harness) self.add_grafana_dashboard_relation(self.harness) - # We want the real get_unit_data method here, - # because its logic is being tested. - self.get_unit_data_patcher.stop() + # simulate tempest ready self.harness.charm.peers = Mock() self.harness.charm.peers.interface.peers_rel.data = MagicMock() self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = { @@ -525,9 +513,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase): self.add_identity_ops_relation(self.harness) self.add_grafana_dashboard_relation(self.harness) - # We want the real get_unit_data method here, - # because its logic is being tested. - self.get_unit_data_patcher.stop() + # simulate tempest not ready self.harness.charm.peers = Mock() self.harness.charm.peers.interface.peers_rel.data = MagicMock() self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = { diff --git a/charms/tempest-k8s/tests/unit/test_validators.py b/charms/tempest-k8s/tests/unit/test_validators.py index 29d55c95..3026a170 100644 --- a/charms/tempest-k8s/tests/unit/test_validators.py +++ b/charms/tempest-k8s/tests/unit/test_validators.py @@ -17,6 +17,9 @@ """Unit tests for Tempest validator utility functions.""" import unittest +from dataclasses import ( + FrozenInstanceError, +) from utils.validators import ( validated_schedule, @@ -85,3 +88,21 @@ class TempestCharmValidatorTests(unittest.TestCase): self.assertFalse(schedule.valid) self.assertIn("not acceptable", schedule.err) self.assertEqual(schedule.value, exp) + + def test_expression_too_sparse(self): + """Verify an expression with a very long period is caught.""" + exp = "0 4 30 2 *" # on february 30 ;) + schedule = validated_schedule(exp) + self.assertFalse(schedule.valid) + self.assertIn("not calculate a range", schedule.err) + self.assertEqual(schedule.value, exp) + + def test_schedule_type_is_immutable(self): + """Schedule should be immutable.""" + # this is both to avoid issues with caching it, + # and to ensure a validated schedule is not accidentally modified + # (it should not be modified because then it may not be valid any more) + schedule = validated_schedule("5 4 * * *") + self.assertTrue(schedule.valid) + with self.assertRaises(FrozenInstanceError): + schedule.valid = False