
The current logic checks the state of the nodes that had started the join process at the time the operator took its snapshot of the environment. It will not check the state of nodes it is not aware of, i.e. nodes that were not captured in that initial snapshot. Hence there is a need to introduce a backoff time, as it takes a while before all the nodes start to join the cluster. This is a short-term stopgap approach until the Promenade API is ready for consumption.

Change-Id: I2bdf9c970ecb509fe833fd353e6648a97118d79b
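As a usage illustration, below is a minimal sketch of how this check could be wired into an Airflow task together with the initial backoff described above. The DAG object, task id, and the delay/timeout/interval values are illustrative assumptions (mirroring the example in the module docstring), not part of this change; the PythonOperator import path assumes Airflow 1.x.

    import time

    from airflow.operators.python_operator import PythonOperator

    from check_k8s_node_status import check_node_status


    def _wait_for_k8s_nodes():
        # Back off first so the join process has time to start on all nodes
        time.sleep(120)

        # Poll the node state every 60 seconds, time out after 15 minutes
        check_node_status(900, 60)


    # 'dag' is assumed to be an existing airflow.DAG instance
    k8s_node_check = PythonOperator(
        task_id='k8s_node_check',
        python_callable=_wait_for_k8s_nodes,
        dag=dag)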
# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import time

from airflow.exceptions import AirflowException
from kubernetes import client, config


def check_node_status(time_out, interval):
    """This function retrieves the current state of the nodes in the
    Kubernetes cluster. We can use it to check the state of the
    cluster join process (drydock/promenade) and determine if all
    the bare metal nodes have successfully joined the Kubernetes
    cluster.

    :param time_out: Maximum time (in seconds) to wait for all nodes
                     to reach Ready state
    :param interval: Time (in seconds) to wait between queries of the
                     node state

    Example::

        import time
        from check_k8s_node_status import check_node_status

        # Wait for a while before checking the cluster-join process as
        # it takes time for the process to be triggered across all nodes
        # We will wait for 120 seconds in this example
        time.sleep(120)

        # Calls the function to check that all nodes are in Ready state
        # The time out in this case is set to 15 mins while the time
        # interval has been set to 60 seconds
        check_node_status(900, 60)
    """
    # Initialize Variable
    not_ready_node_list = []

    # Note that we are using 'in_cluster_config'
    config.load_incluster_config()
    v1 = client.CoreV1Api()

    # Logs initial state of all nodes in the cluster
    ret_init = v1.list_node(watch=False)

    logging.info("Current state of nodes in Cluster is")

    for i in ret_init.items:
        logging.info("%s\t%s\t%s", i.metadata.name,
                     i.status.conditions[-1].status,
                     i.status.conditions[-1].type)

        # Populates the list of nodes in the Cluster
        not_ready_node_list.append(i.metadata.name)

    # Calculate the number of iterations of the polling loop
    # Ensure that 'time_out' and 'interval' are passed in as integers
    # The result of the division will be a floating point number, which
    # we round off to the nearest whole number
    end_range = round(int(time_out) / int(interval))

    for i in range(0, end_range + 1):
        # Reset cluster_ready to True for each iteration
        cluster_ready = True

        # Get updated snapshot view of Cluster for each iteration
        ret = v1.list_node(watch=False)

        # Check the current state of nodes that were not in Ready state
        # during the previous iteration. Note that the 'Ready' condition
        # is assumed to be the last entry in a node's conditions list.
        for j in ret.items:
            if j.metadata.name in not_ready_node_list:
                if j.status.conditions[-1].status != 'True':
                    # Set cluster_ready to False
                    cluster_ready = False

                    # Print current state of node
                    logging.info("Node %s is not Ready", j.metadata.name)
                    logging.debug("Current status of %s is %s",
                                  j.metadata.name,
                                  j.status.conditions[-1].message)
                else:
                    # Remove 'Ready' node from list
                    not_ready_node_list.remove(j.metadata.name)

                    logging.info("Node %s is in Ready state",
                                 j.metadata.name)

        # Raise Time Out Exception
        if not cluster_ready and i == end_range:
            raise AirflowException("Timed Out! One or more Nodes fail to "
                                   "get into Ready State!")

        # Exit loop if Cluster is in Ready state
        if cluster_ready:
            logging.info("All nodes are in Ready state")
            break
        else:
            # Back off and check again in the next iteration
            logging.info("Wait for %d seconds...", int(interval))
            time.sleep(int(interval))