stability fixes for vault-manager rekey
Continue/complete the rekey procedure when vault-manager is interrupted (kill -9). Fixes include: - Refactor logic of rekeyRecover function - additionally handle specific failure scenarios to permit the rekey procedure to continue - correct return codes of procedure functions to fall through to the recovery procedure - re-sort the tests of needsShuffle - misc adjustment of logs and comments The additional handling of failure scenarios includes: - partial deletion of cluster-rekey secrets after copying to cluster-key - restart rekey on failure during authentication Test Plan: PASS vault sanity, ha sanity PASS IPv4 and IPv6 PASS system application-update, and platform application update PASS rekey operation without interruption PASS bashate the rendered init.sh Stability testing includes kubectl deleting pods and kill -9 processes during rekey operation at intervals spread across the procedure, with slight random time added to each interval PASS delete a standby vault server pod PASS delete the active vault server pod PASS delete the vault-manager pod PASS delete the vault-manager pod and a random vault server pod PASS delete the vault-manager pod and the active pod PASS delete the vault-manager pod and a standby pod PASS kill -9 vault-manager process PASS kill -9 active vault server process PASS kill -9 standby vault server process PASS kill -9 random selection of vault and vault-manager processes Story: 2010930 Task: 49174 Change-Id: I508e93a36de9ca8b4c8fa1da7941fe49936de159 Signed-off-by: Michel Thebeau <Michel.Thebeau@windriver.com>
This commit is contained in:
parent
615d6e4657
commit
be0e85ec77
@ -2000,7 +2000,8 @@ data:
|
||||
cluster-rekey-shuffle \
|
||||
cluster-rekey-audit
|
||||
if [ $? -eq 0 ]; then
|
||||
log $WARNING "Check initialization out of order"
|
||||
# this presents a recovery procedure for one of those
|
||||
# procedure steps
|
||||
return 1
|
||||
fi
|
||||
|
||||
@ -2349,7 +2350,7 @@ data:
|
||||
cluster-rekey-audit
|
||||
if [ $? -eq 0 ]; then
|
||||
log $WARNING "Check verify out of order"
|
||||
return 2
|
||||
return 1
|
||||
fi
|
||||
|
||||
return 0
|
||||
@ -2515,9 +2516,24 @@ data:
|
||||
assertRekeyStarted --not
|
||||
progress=$?
|
||||
if [ "$progress" -ne 0 ]; then
|
||||
# 1 - maintain the status of rekey in progress
|
||||
# 2 - api error, try again later
|
||||
return "$progress"
|
||||
fi
|
||||
|
||||
secretExists cluster-rekey-verified >/dev/null
|
||||
if [ $? -ne 0 ]; then
|
||||
# proceeds to check procedure step
|
||||
return 1
|
||||
fi
|
||||
|
||||
secretsExistAny cluster-rekey-shuffle \
|
||||
cluster-rekey-audit
|
||||
if [ $? -eq 0 ]; then
|
||||
# proceed to recovery
|
||||
return 1
|
||||
fi
|
||||
|
||||
assertShardSecrets cluster-rekey
|
||||
case $? in
|
||||
0)
|
||||
@ -2530,32 +2546,17 @@ data:
|
||||
return 1
|
||||
;;
|
||||
*)
|
||||
log $ERROR "The number key shared secrets for" \
|
||||
# with cluster-rekey-verified, an incomplete set of
|
||||
# cluster-rekey indicates partial deletion after copying
|
||||
# to cluster-key
|
||||
# will want to audit the cluster-key secrets before
|
||||
# deleting cluster-rekey
|
||||
log $WARNING "The number key shard secrets for" \
|
||||
"cluster-rekey is not complete"
|
||||
return 2
|
||||
return 1
|
||||
;;
|
||||
esac
|
||||
|
||||
secretExists cluster-rekey-verified >/dev/null
|
||||
if [ $? -ne 0 ]; then
|
||||
# is it possible that the vault server cancelled the rekey
|
||||
# request?
|
||||
# There is no rekey in progress, and there is a set of
|
||||
# cluster-rekey secrets, but vault-manager hasn't verified
|
||||
#
|
||||
# Or did vault-manager exit unexpectedly
|
||||
return 1
|
||||
fi
|
||||
|
||||
# normally should not need to check, but useful when
|
||||
# running manually
|
||||
secretsExistAny cluster-rekey-shuffle \
|
||||
cluster-rekey-audit
|
||||
if [ $? -eq 0 ]; then
|
||||
log $WARNING "Check cluster-key shuffle out of order"
|
||||
return 2
|
||||
fi
|
||||
|
||||
# otherwise allow rekeyShuffleKeys to be re-entrant to
|
||||
# the existence of or lack of cluster-key and cluster-key-bk
|
||||
# cluster-rekey is only deleted when confirmed to be copied to
|
||||
@ -2757,8 +2758,8 @@ data:
|
||||
|
||||
secretExists cluster-rekey-audit >/dev/null
|
||||
if [ $? -eq 0 ]; then
|
||||
log $ERROR "rekey audit already completed"
|
||||
return 1
|
||||
log $INFO "rekey audit already completed"
|
||||
return 3
|
||||
fi
|
||||
|
||||
assertShardSecrets cluster-key
|
||||
@ -2900,6 +2901,7 @@ data:
|
||||
fi
|
||||
assertServerStatus "$REKEY_STATUS_JSON"
|
||||
if [ $? -ne 0 ]; then
|
||||
# wait for the vault servers to sync
|
||||
return 1
|
||||
fi
|
||||
|
||||
@ -2908,6 +2910,9 @@ data:
|
||||
| jq -r '.verification_nonce' )"
|
||||
if [ "$inprogress" == "true" ]; then
|
||||
# If a rekey is in progress, then cancel it
|
||||
# - an authentication will reinitialize
|
||||
# - a verification will reinitialize
|
||||
# - a rekeyAudit will retry
|
||||
log $INFO "Cancelling rekey in progress"
|
||||
NO_HEADER=true \
|
||||
API_TMOUT=$API_REKEY_OP_TMOUT \
|
||||
@ -2932,81 +2937,101 @@ data:
|
||||
secretExists cluster-rekey-audit >/dev/null
|
||||
audit_exists=$?
|
||||
|
||||
# Handle condition where secrets were shuffled but vault-manager
|
||||
# failed before recording the milestone cluster-rekey-shuffle
|
||||
if [ "$verified_exists" -eq 0 \
|
||||
-a "$shuffle_exists" -ne 0 ]; then
|
||||
if [ "$rekey_exists" -eq "$KEY_SECRET_SHARES" ]; then
|
||||
# review each of the milestones to discern the failure point
|
||||
if [ "$audit_exists" -eq 0 ]; then
|
||||
true
|
||||
# no recovery options here
|
||||
# pass through
|
||||
elif [ "$shuffle_exists" -eq 0 ]; then
|
||||
true
|
||||
# no recovery options here
|
||||
# pass through
|
||||
elif [ "$verified_exists" -eq 0 ]; then
|
||||
if [ "$rekey_exists" -gt 0 ]; then
|
||||
if [ "$rekey_exists" -lt "$KEY_SECRET_SHARES" ]; then
|
||||
# with verified_exists, indicates partial deletion
|
||||
# of the cluster-rekey secrets after copying to
|
||||
# cluster-key. Audit the cluster-key secrets before
|
||||
# deleting rekey
|
||||
rekeyAudit cluster-key
|
||||
if [ $? -ne 0 ]; then
|
||||
log $ERROR "Audit cluster-key fails with a" \
|
||||
"partial set of cluster-rekey"
|
||||
return 1
|
||||
fi
|
||||
|
||||
deleteShardSecrets cluster-rekey
|
||||
fi
|
||||
|
||||
# Handle condition where secrets were shuffled but
|
||||
# vault-manager failed before recording the
|
||||
# milestone cluster-rekey-shuffle
|
||||
|
||||
# auditRekey will double-check that cluster-key is
|
||||
# in use
|
||||
set_secret cluster-rekey-shuffle /dev/stdin \
|
||||
<<<"$( get_secret cluster-rekey-request )"
|
||||
log $INFO "Continuing rekey procedure with audit" \
|
||||
"of cluster-key"
|
||||
return
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
# else: pass through
|
||||
else
|
||||
if [ "$rekey_exists" -eq 0 ]; then
|
||||
# Handle condition where an active server fails during
|
||||
# verification: vault may have cancelled the rekey procedure
|
||||
|
||||
# The question is: which shards are the vault servers
|
||||
# using?
|
||||
log $INFO "Recovering from mismatch of cluster-rekey" \
|
||||
"and verified status"
|
||||
|
||||
# Handle condition where verification is in progress but
|
||||
# vault-manager did not store shards. The rekey was canceled
|
||||
# above
|
||||
if [ "$inprogress" == "true" ]; then
|
||||
if [ "$rekey_exists" -gt 0 ]; then
|
||||
# cluster-rekey secrets do not exist or
|
||||
# partial set exists
|
||||
secretsExistAny cluster-rekey-verified \
|
||||
cluster-rekey-shuffle \
|
||||
cluster-rekey-audit
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
# the rekey procedure will restart
|
||||
if [ "$rekey_exists" -lt "$KEY_SECRET_SHARES" ]; then
|
||||
log $INFO "Deleting partial set of" \
|
||||
"cluster-rekey secrets"
|
||||
deleteShardSecrets cluster-rekey
|
||||
fi
|
||||
return
|
||||
# Audit the existing shards to see which ones the
|
||||
# vault servers are keyed for.
|
||||
# Most likely that the verification failed due to
|
||||
# active server failing, start with cluster-key
|
||||
rekeyAudit cluster-key
|
||||
if [ $? -eq 0 ]; then
|
||||
# The rekey verification did not complete
|
||||
# remove cluster-rekey secrets
|
||||
# The rekey procedure should restart
|
||||
deleteShardSecrets cluster-rekey
|
||||
log $INFO "Restart rekey procedure"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# this happens when vault-manager process is killed
|
||||
rekeyAudit cluster-rekey
|
||||
if [ $? -eq 0 ]; then
|
||||
set_secret cluster-rekey-verified /dev/null \
|
||||
<<<$( get_secret cluster-rekey-request )
|
||||
log $INFO "Continue rekey procedure with cluster-rekey"
|
||||
return 0
|
||||
fi
|
||||
# else: pass through
|
||||
elif [ "$rekey_exists" -eq 5 ]; then
|
||||
# There are no cluster-rekey secrets; and the rekey is
|
||||
# cancelled: the rekey procedure will restart
|
||||
log $INFO "Continue rekey procedure with initialization"
|
||||
return 0
|
||||
else # cluster-rekey secrets are incomplete
|
||||
# Handle condition where verification is needed but
|
||||
# vault-manager did not store shards. The rekey was
|
||||
# canceled above
|
||||
|
||||
# assert cluster-key before deleting rekey
|
||||
rekeyAudit cluster-key
|
||||
if [ $? -eq 0 ]; then
|
||||
# the rekey procedure will restart
|
||||
log $INFO "Deleting partial set of" \
|
||||
"cluster-rekey secrets"
|
||||
deleteShardSecrets cluster-rekey
|
||||
return 0
|
||||
fi
|
||||
# else: pass through
|
||||
fi
|
||||
fi
|
||||
|
||||
# Handle condition where an active server fails during
|
||||
# verification: vault may have cancelled the rekey procedure
|
||||
if [ "$rekey_exists" -eq 0 \
|
||||
-a "$verified_exists" -ne 0 ]; then
|
||||
|
||||
# The question is: which shards are the vault servers
|
||||
# using?
|
||||
log $INFO "Recovering from mismatch of cluster-rekey" \
|
||||
"and verified status"
|
||||
|
||||
# Audit the existing shards to see which ones the
|
||||
# vault servers are keyed for.
|
||||
# Most likely that the verification failed due to
|
||||
# active server failing, start with cluster-key
|
||||
rekeyAudit cluster-key
|
||||
if [ $? -eq 0 ]; then
|
||||
# The rekey verification did not complete
|
||||
# remove cluster-rekey secrets
|
||||
# The rekey procedure should restart
|
||||
deleteShardSecrets cluster-rekey
|
||||
log $INFO "Restart rekey procedure"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# this happens when vault-manager process is killed
|
||||
rekeyAudit cluster-rekey
|
||||
if [ $? -eq 0 ]; then
|
||||
set_secret cluster-rekey-verified /dev/null \
|
||||
<<<$( get_secret cluster-rekey-request )
|
||||
log $INFO "Continue rekey procedure with cluster-rekey"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# try again later
|
||||
return 1
|
||||
fi
|
||||
log $ERROR "Did not recover from current rekey status"
|
||||
}
|
||||
|
||||
@ -3060,6 +3085,19 @@ data:
|
||||
;;
|
||||
1) # continue to procedure step
|
||||
;;
|
||||
3) # audit is already completed
|
||||
secretExists cluster-rekey-audit >/dev/null
|
||||
if [ $? -eq 0 ]; then
|
||||
# the cluster-key secrets were audited, but vault
|
||||
# manager didn't get a chance to set
|
||||
# cluster-rekey-audit milestone
|
||||
finalizeRekey
|
||||
return
|
||||
fi
|
||||
log $ERROR "Discrepancy between needsAudit and" \
|
||||
"rekeyVault"
|
||||
return
|
||||
;;
|
||||
*)
|
||||
# an error occurs for which the procedure should not
|
||||
# continue
|
||||
|
Loading…
x
Reference in New Issue
Block a user