
Directory-based OSDs are failing to deploy because 'python' has been
replaced with 'python3' in the image. This change updates the python
commands to use python3 instead. There is also a dependency on forego,
which has been removed from the image; this change also modifies the
deployment so that it doesn't depend on forego. Ownership of the OSD
keyring file has also been changed so that it is owned by the 'ceph'
user, and the ceph-osd process now uses --setuser and --setgroup to run
as the same user.

Change-Id: If825df283bca0b9f54406084ac4b8f958a69eab7
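As a hedged illustration of the launch change described above (a sketch only; the keyring variable is the one used in this script, but the exact ceph-osd flags beyond --setuser/--setgroup are assumptions, not taken from this change):

    chown ceph:ceph "${OSD_KEYRING}"    # keyring now owned by the 'ceph' user
    exec /usr/bin/ceph-osd --cluster "${CLUSTER}" -f -i "${OSD_ID}" \
      --setuser ceph --setgroup ceph    # run as the same user instead of root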
#!/bin/bash

{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}

set -ex
export PS4='+${BASH_SOURCE:+$(basename ${BASH_SOURCE}):${LINENO}:}${FUNCNAME:+${FUNCNAME}():} '

: "${CRUSH_LOCATION:=root=default host=${HOSTNAME}}"
|
|
: "${OSD_PATH_BASE:=/var/lib/ceph/osd/${CLUSTER}}"
|
|
: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}"
|
|
: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}"
|
|
: "${OSD_JOURNAL_UUID:=$(uuidgen)}"
|
|
: "${OSD_JOURNAL_SIZE:=$(awk '/^osd_journal_size/{print $3}' ${CEPH_CONF}.template)}"
|
|
: "${OSD_WEIGHT:=1.0}"
|
|
|
|
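# Parse the CRUSH failure-domain settings and device class for this host out
# of /etc/ceph/storage.json. python3 is used here because the image no longer
# ships a 'python' binary.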
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python3 -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python3 -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))')
eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python3 -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))')
eval CRUSH_FAILURE_DOMAIN_FROM_HOSTNAME_MAP=$(cat /etc/ceph/storage.json | jq '.failure_domain_by_hostname_map."'$HOSTNAME'"')
eval DEVICE_CLASS=$(cat /etc/ceph/storage.json | python3 -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["device_class"]))')

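# Require a supported Ceph release (Luminous through Octopus).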
if [[ $(ceph -v | egrep "octopus|nautilus|mimic|luminous" > /dev/null 2>&1; echo $?) -ne 0 ]]; then
  echo "ERROR- need Luminous/Mimic/Nautilus/Octopus release"
  exit 1
fi

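# The hostname is used as the CRUSH host bucket name, so it must be set.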
if [ -z "${HOSTNAME}" ]; then
|
|
echo "HOSTNAME not set; This will prevent to add an OSD into the CRUSH map"
|
|
exit 1
|
|
fi
|
|
|
|
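# Build ${CEPH_CONF} from the template, substituting the current mon discovery
# endpoints for mon_host when they can be looked up.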
if [[ ! -e ${CEPH_CONF}.template ]]; then
  echo "ERROR- ${CEPH_CONF}.template must exist; get it from your existing mon"
  exit 1
else
  ENDPOINT=$(kubectl get endpoints ceph-mon-discovery -n ${NAMESPACE} -o json | awk -F'"' -v port=${MON_PORT} \
    -v version=v1 -v msgr_version=v2 \
    -v msgr2_port=${MON_PORT_V2} \
    '/"ip"/{print "["version":"$4":"port"/"0","msgr_version":"$4":"msgr2_port"/"0"]"}' | paste -sd',')
  if [[ "${ENDPOINT}" == "" ]]; then
    /bin/sh -c -e "cat ${CEPH_CONF}.template | tee ${CEPH_CONF}" || true
  else
    /bin/sh -c -e "cat ${CEPH_CONF}.template | sed 's#mon_host.*#mon_host = ${ENDPOINT}#g' | tee ${CEPH_CONF}" || true
  fi
fi

# Wait for a file to exist, regardless of the type
function wait_for_file {
  timeout 10 bash -c "while [ ! -e ${1} ]; do echo 'Waiting for ${1} to show up' && sleep 1 ; done"
}

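# Check whether a command is available on the PATH.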
function is_available {
  command -v $@ &>/dev/null
}

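# Retry a ceph command (10s between tries) until it succeeds or six retries
# have been exhausted, to ride out transient mon unavailability.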
function ceph_cmd_retry() {
  cnt=0
  until "ceph" "$@" || [ $cnt -ge 6 ]; do
    sleep 10
    ((cnt++))
  done
}

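# Place this OSD at the given CRUSH location with its configured weight,
# creating the host bucket if necessary.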
function crush_create_or_move {
  local crush_location=${1}
  ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location}
}

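# Add this OSD under a custom failure domain (e.g. a rack bucket) and, if it
# is not already there, move the corresponding buckets into place.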
function crush_add_and_move {
  local crush_failure_domain_type=${1}
  local crush_failure_domain_name=${2}
  local crush_location=$(echo "root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}")
  crush_create_or_move "${crush_location}"
  local crush_failure_domain_location_check=$(ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}')
  if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ]; then
    # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
    # as create-or-move may not appropriately move them.
    ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
      osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true
    ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
      osd crush move "${crush_failure_domain_name}" root=default || true
    ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
      osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true
  fi
}

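# Apply the device class, then place this OSD in the CRUSH map according to
# the failure-domain settings parsed from storage.json.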
function crush_location {
  set_device_class
  if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then
    if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then
      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}"
    elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then
      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))"
    elif [ "x${CRUSH_FAILURE_DOMAIN_FROM_HOSTNAME_MAP}" != "xnull" ]; then
      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_FROM_HOSTNAME_MAP}"
    else
      # NOTE(supamatt): if none of these variables are defined, we fall back to the default behavior
      crush_create_or_move "${CRUSH_LOCATION}"
    fi
  else
    crush_create_or_move "${CRUSH_LOCATION}"
  fi
}

# Calculate proper device names, given a device and partition number
function dev_part {
  local osd_device=${1}
  local osd_partition=${2}

  if [[ -L ${osd_device} ]]; then
    # This device is a symlink. Work out its actual device
    local actual_device=$(readlink -f "${osd_device}")
    local bn=$(basename "${osd_device}")
    if [[ "${actual_device:0-1:1}" == [0-9] ]]; then
      local desired_partition="${actual_device}p${osd_partition}"
    else
      local desired_partition="${actual_device}${osd_partition}"
    fi
    # Now search for a symlink in the directory of $osd_device
    # that has the correct desired partition, and the longest
    # shared prefix with the original symlink
    local symdir=$(dirname "${osd_device}")
    local link=""
    local pfxlen=0
    for option in ${symdir}/*; do
      [[ -e $option ]] || break
      if [[ $(readlink -f "${option}") == "${desired_partition}" ]]; then
        local optprefixlen=$(prefix_length "${option}" "${bn}")
        if [[ ${optprefixlen} -gt ${pfxlen} ]]; then
          link=${option}
          pfxlen=${optprefixlen}
        fi
      fi
    done
    if [[ $pfxlen -eq 0 ]]; then
      >&2 echo "Could not locate appropriate symlink for partition ${osd_partition} of ${osd_device}"
      exit 1
    fi
    echo "$link"
  elif [[ "${osd_device:0-1:1}" == [0-9] ]]; then
    echo "${osd_device}p${osd_partition}"
  else
    echo "${osd_device}${osd_partition}"
  fi
}

function zap_extra_partitions {
  # Examine temp mount and delete any block.db and block.wal partitions
  mountpoint=${1}
  journal_disk=""
  journal_part=""
  block_db_disk=""
  block_db_part=""
  block_wal_disk=""
  block_wal_part=""

  # Discover journal, block.db, and block.wal partitions first before deleting anything
  # If the partitions are on the same disk, deleting one can affect discovery of the other(s)
  if [ -L "${mountpoint}/journal" ]; then
    journal_disk=$(readlink -m ${mountpoint}/journal | sed 's/[0-9]*//g')
    journal_part=$(readlink -m ${mountpoint}/journal | sed 's/[^0-9]*//g')
  fi
  if [ -L "${mountpoint}/block.db" ]; then
    block_db_disk=$(readlink -m ${mountpoint}/block.db | sed 's/[0-9]*//g')
    block_db_part=$(readlink -m ${mountpoint}/block.db | sed 's/[^0-9]*//g')
  fi
  if [ -L "${mountpoint}/block.wal" ]; then
    block_wal_disk=$(readlink -m ${mountpoint}/block.wal | sed 's/[0-9]*//g')
    block_wal_part=$(readlink -m ${mountpoint}/block.wal | sed 's/[^0-9]*//g')
  fi

  # Delete any discovered journal, block.db, and block.wal partitions
  if [ ! -z "${journal_disk}" ]; then
    sgdisk -d ${journal_part} ${journal_disk}
    /sbin/udevadm settle --timeout=600
    /usr/bin/flock -s ${journal_disk} /sbin/partprobe ${journal_disk}
    /sbin/udevadm settle --timeout=600
  fi
  if [ ! -z "${block_db_disk}" ]; then
    sgdisk -d ${block_db_part} ${block_db_disk}
    /sbin/udevadm settle --timeout=600
    /usr/bin/flock -s ${block_db_disk} /sbin/partprobe ${block_db_disk}
    /sbin/udevadm settle --timeout=600
  fi
  if [ ! -z "${block_wal_disk}" ]; then
    sgdisk -d ${block_wal_part} ${block_wal_disk}
    /sbin/udevadm settle --timeout=600
    /usr/bin/flock -s ${block_wal_disk} /sbin/partprobe ${block_wal_disk}
    /sbin/udevadm settle --timeout=600
  fi
}

function disk_zap {
  # Run all the commands that ceph-disk zap uses to clear a disk
  local device=${1}
  wipefs --all ${device}
  # Wipe the first 200MB boundary, as Bluestore redeployments will not work otherwise
  dd if=/dev/zero of=${device} bs=1M count=200
  sgdisk --zap-all -- ${device}
  sgdisk --clear --mbrtogpt -- ${device}
}

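# Re-read partition tables and wait for the udev event queue to drain so that
# newly created or deleted partitions are visible before we continue.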
function udev_settle {
  partprobe "${OSD_DEVICE}"
  if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then
    if [ ! -z "$BLOCK_DB" ]; then
      partprobe "${BLOCK_DB}"
    fi
    if [ ! -z "$BLOCK_WAL" ] && [ "$BLOCK_WAL" != "$BLOCK_DB" ]; then
      partprobe "${BLOCK_WAL}"
    fi
  else
    if [ "x$JOURNAL_TYPE" == "xblock-logical" ] && [ ! -z "$OSD_JOURNAL" ]; then
      OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
      if [ ! -z "$OSD_JOURNAL" ]; then
        local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
        partprobe "${JDEV}"
      fi
    fi
  fi
  # watch the udev event queue, and exit if all current events are handled
  udevadm settle --timeout=600

  # On occasion udev may not create the correct device symlinks for Ceph, so we create them manually just in case
  mkdir -p /dev/disk/by-partuuid
  for dev in $(awk '!/rbd/{print $4}' /proc/partitions | grep "[0-9]"); do
    diskdev=$(echo "${dev//[!a-z]/}")
    partnum=$(echo "${dev//[!0-9]/}")
    ln -s "../../${dev}" "/dev/disk/by-partuuid/$(sgdisk -i ${partnum} /dev/${diskdev} | awk '/Partition unique GUID/{print tolower($4)}')" || true
  done
}

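# Apply the configured CRUSH device class (e.g. hdd or ssd) to this OSD if it
# differs from the class the cluster currently has recorded.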
function set_device_class {
  if [ ! -z "$DEVICE_CLASS" ]; then
    if [ "x$DEVICE_CLASS" != "x$(get_device_class)" ]; then
      ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
        osd crush rm-device-class "osd.${OSD_ID}"
      ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
        osd crush set-device-class "${DEVICE_CLASS}" "osd.${OSD_ID}"
    fi
  fi
}

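# Report the CRUSH device class currently assigned to this OSD.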
function get_device_class {
  echo $(ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
    osd crush get-device-class "osd.${OSD_ID}")
}