Emilien Macchi 42c4dc36bc heat-engine: disable the check on database connection
After some investigations, the mysql connection is not persistent and is
only established when heat API is used by a client.
By checking if the TCP connection is actually established, we have a fail
every time Pacemaker checks. If no Heat API requests have been sent for
some time, Heat will be considered as failed and will be restart which
is not clean.

This patch aims to disable this check and only let AMQP connection
check.

Signed-off-by: Emilien Macchi <emilien.macchi@enovance.com>
2014-02-17 11:07:59 +01:00

358 lines
13 KiB
Bash

#!/bin/sh
#
#
# OpenStack Orchestration Engine Service (heat-engine)
#
# Description: Manages an OpenStack Orchestration Engine Service (heat-engine) process as an HA resource
#
# Authors: Emilien Macchi
#
# Support: openstack@lists.launchpad.net
# License: Apache Software License (ASL) 2.0
#
#
# See usage() function below for more details ...
#
# OCF instance parameters:
# OCF_RESKEY_binary
# OCF_RESKEY_config
# OCF_RESKEY_user
# OCF_RESKEY_pid
# OCF_RESKEY_monitor_binary
# OCF_RESKEY_amqp_server_port
# OCF_RESKEY_zeromq
# OCF_RESKEY_additional_parameters
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
#######################################################################
# Fill in some defaults if no values are specified
OCF_RESKEY_binary_default="heat-engine"
OCF_RESKEY_config_default="/etc/heat/heat.conf"
OCF_RESKEY_user_default="heat"
OCF_RESKEY_pid_default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid"
OCF_RESKEY_amqp_server_port_default="5672"
OCF_RESKEY_zeromq_default="false"
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
: ${OCF_RESKEY_amqp_server_port=${OCF_RESKEY_amqp_server_port_default}}
: ${OCF_RESKEY_zeromq=${OCF_RESKEY_zeromq_default}}
#######################################################################
usage() {
cat <<UEND
usage: $0 (start|stop|validate-all|meta-data|status|monitor)
$0 manages an OpenStack Orchestration Engine Service (heat-engine) process as an HA resource
The 'start' operation starts the heat-engine service.
The 'stop' operation stops the heat-engine service.
The 'validate-all' operation reports whether the parameters are valid
The 'meta-data' operation reports this RA's meta-data information
The 'status' operation reports whether the heat-engine service is running
The 'monitor' operation reports whether the heat-engine service seems to be working
UEND
}
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="heat-engine">
<version>1.0</version>
<longdesc lang="en">
Resource agent for the OpenStack Orchestration Engine Service (heat-engine)
May manage a heat-engine instance or a clone set that
creates a distributed heat-engine cluster.
</longdesc>
<shortdesc lang="en">Manages the OpenStack Orchestration Engine Service (heat-engine)</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the OpenStack Orchestration Engine server binary (heat-engine)
</longdesc>
<shortdesc lang="en">OpenStack Orchestration Engine server binary (heat-engine)</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Location of the OpenStack Orchestration Engine Service (heat-engine) configuration file
</longdesc>
<shortdesc lang="en">OpenStack Orchestration Engine (heat-engine) config file</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running OpenStack Orchestration Engine Service (heat-engine)
</longdesc>
<shortdesc lang="en">OpenStack Orchestration Engine Service (heat-engine) user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>
<parameter name="pid" unique="0" required="0">
<longdesc lang="en">
The pid file to use for this OpenStack Orchestration Engine Service (heat-engine) instance
</longdesc>
<shortdesc lang="en">OpenStack Orchestration Engine Service (heat-engine) pid file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}" />
</parameter>
<parameter name="amqp_server_port" unique="0" required="0">
<longdesc lang="en">
The listening port number of the AMQP server. Use for monitoring purposes
</longdesc>
<shortdesc lang="en">AMQP listening port</shortdesc>
<content type="integer" default="${OCF_RESKEY_amqp_server_port_default}" />
</parameter>
<parameter name="zeromq" unique="0" required="0">
<longdesc lang="en">
If zeromq is used, this will disable the connection test to the AMQP server. Use for monitoring purposes
</longdesc>
<shortdesc lang="en">Zero-MQ usage</shortdesc>
<content type="boolean" default="${OCF_RESKEY_zeromq_default}" />
</parameter>
<parameter name="additional_parameters" unique="0" required="0">
<longdesc lang="en">
Additional parameters to pass on to the OpenStack Orchestration Engine Service (heat-engine)
</longdesc>
<shortdesc lang="en">Additional parameters for heat-engine</shortdesc>
<content type="string" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" timeout="30" interval="20" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
#######################################################################
# Functions invoked by resource manager actions
heat_engine_check_port() {
# This function has been taken from the squid RA and improved a bit
# The length of the integer must be 4
# Examples of valid port: "1080", "0080"
# Examples of invalid port: "1080bad", "0", "0000", ""
local int
local cnt
int="$1"
cnt=${#int}
echo $int |egrep -qx '[0-9]+(:[0-9]+)?(,[0-9]+(:[0-9]+)?)*'
if [ $? -ne 0 ] || [ $cnt -ne 4 ]; then
ocf_log err "Invalid port number: $1"
exit $OCF_ERR_CONFIGURED
fi
}
heat_engine_validate() {
local rc
check_binary $OCF_RESKEY_binary
check_binary netstat
heat_engine_check_port $OCF_RESKEY_amqp_server_port
# A config file on shared storage that is not available
# during probes is OK.
if [ ! -f $OCF_RESKEY_config ]; then
if ! ocf_is_probe; then
ocf_log err "Config $OCF_RESKEY_config doesn't exist"
return $OCF_ERR_INSTALLED
fi
ocf_log_warn "Config $OCF_RESKEY_config not available during a probe"
fi
getent passwd $OCF_RESKEY_user >/dev/null 2>&1
rc=$?
if [ $rc -ne 0 ]; then
ocf_log err "User $OCF_RESKEY_user doesn't exist"
return $OCF_ERR_INSTALLED
fi
true
}
heat_engine_status() {
local pid
local rc
if [ ! -f $OCF_RESKEY_pid ]; then
ocf_log info "OpenStack Orchestration Engine (heat-engine) is not running"
return $OCF_NOT_RUNNING
else
pid=`cat $OCF_RESKEY_pid`
fi
ocf_run -warn kill -s 0 $pid
rc=$?
if [ $rc -eq 0 ]; then
return $OCF_SUCCESS
else
ocf_log info "Old PID file found, but OpenStack Orchestration Engine (heat-engine) is not running"
return $OCF_NOT_RUNNING
fi
}
heat_engine_monitor() {
local rc
local pid
local rc_amqp
local engine_amqp_check
heat_engine_status
rc=$?
# If status returned anything but success, return that immediately
if [ $rc -ne $OCF_SUCCESS ]; then
return $rc
fi
# Check the connections according to the PID.
# We are sure to hit the heat-engine process and not other heat process with the same connection behavior (for example heat-api)
if ocf_is_true "$OCF_RESKEY_zeromq"; then
pid=`cat $OCF_RESKEY_pid`
engine_amqp_check=`netstat -punt | grep -s "$OCF_RESKEY_amqp_server_port" | grep -s "$pid" | grep -qs "ESTABLISHED"`
rc_amqp=$?
if [ $rc_amqp -ne 0 ]; then
ocf_log err "Heat Engine is not connected to the AMQP server: AMQP connection test returned $rc_amqp"
return $OCF_NOT_RUNNING
fi
fi
ocf_log debug "OpenStack Orchestration Engine (heat-engine) monitor succeeded"
return $OCF_SUCCESS
}
heat_engine_start() {
local rc
heat_engine_status
rc=$?
if [ $rc -eq $OCF_SUCCESS ]; then
ocf_log info "OpenStack Orchestration Engine (heat-engine) already running"
return $OCF_SUCCESS
fi
# run the actual heat-engine daemon. Don't use ocf_run as we're sending the tool's output
# straight to /dev/null anyway and using ocf_run would break stdout-redirection here.
su ${OCF_RESKEY_user} -s /bin/sh -c "${OCF_RESKEY_binary} --config-file=$OCF_RESKEY_config \
$OCF_RESKEY_additional_parameters"' >> /dev/null 2>&1 & echo $!' > $OCF_RESKEY_pid
# Spin waiting for the server to come up.
while true; do
heat_engine_monitor
rc=$?
[ $rc -eq $OCF_SUCCESS ] && break
if [ $rc -ne $OCF_NOT_RUNNING ]; then
ocf_log err "OpenStack Orchestration Engine (heat-engine) start failed"
exit $OCF_ERR_GENERIC
fi
sleep 1
done
ocf_log info "OpenStack Orchestration Engine (heat-engine) started"
return $OCF_SUCCESS
}
heat_engine_stop() {
local rc
local pid
heat_engine_status
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ]; then
ocf_log info "OpenStack Orchestration Engine (heat-engine) already stopped"
return $OCF_SUCCESS
fi
# Try SIGTERM
pid=`cat $OCF_RESKEY_pid`
ocf_run kill -s TERM $pid
rc=$?
if [ $rc -ne 0 ]; then
ocf_log err "OpenStack Orchestration Engine (heat-engine) couldn't be stopped"
exit $OCF_ERR_GENERIC
fi
# stop waiting
shutdown_timeout=15
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5))
fi
count=0
while [ $count -lt $shutdown_timeout ]; do
heat_engine_status
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ]; then
break
fi
count=`expr $count + 1`
sleep 1
ocf_log debug "OpenStack Orchestration Engine (heat-engine) still hasn't stopped yet. Waiting ..."
done
heat_engine_status
rc=$?
if [ $rc -ne $OCF_NOT_RUNNING ]; then
# SIGTERM didn't help either, try SIGKILL
ocf_log info "OpenStack Orchestration Engine (heat-engine) failed to stop after ${shutdown_timeout}s \
using SIGTERM. Trying SIGKILL ..."
ocf_run kill -s KILL $pid
fi
ocf_log info "OpenStack Orchestration Engine (heat-engine) stopped"
rm -f $OCF_RESKEY_pid
return $OCF_SUCCESS
}
#######################################################################
case "$1" in
meta-data) meta_data
exit $OCF_SUCCESS;;
usage|help) usage
exit $OCF_SUCCESS;;
esac
# Anything except meta-data and help must pass validation
heat_engine_validate || exit $?
# What kind of method was invoked?
case "$1" in
start) heat_engine_start;;
stop) heat_engine_stop;;
status) heat_engine_status;;
monitor) heat_engine_monitor;;
validate-all) ;;
*) usage
exit $OCF_ERR_UNIMPLEMENTED;;
esac