Merge "A script to check the failure domains of OSDs in PGs"
commit 8df4402bec
ceph-mon/templates/bin/utils/_checkPGs.py.tpl (new executable file, 256 lines)
@@ -0,0 +1,256 @@
#!/usr/bin/python2

import subprocess
import json
import sys
from argparse import *

class cephCRUSH():
    """
    Currently, this script is coded to work with the ceph clusters that have
    these type-ids -- osd, host, rack, root. To add other type_ids to the
    CRUSH map, this script needs enhancements to include the new type_ids.

    type_id name
    ------- ----
    0       osd
    1       host
    2       chassis
    3       rack
    4       row
    5       pdu
    6       pod
    7       room
    8       datacenter
    9       region
    10      root

    Ceph organizes the CRUSH map in hierarchical topology. At the top, it is
    the root. The next levels are racks, hosts, and OSDs, respectively. The
    OSDs are at the leaf level. This script looks at OSDs in each placement
    group of a ceph pool. For each OSD, starting from the OSD leaf level, this
    script traverses up to the root. Along the way, the host and rack are
    recorded and then verified to make sure the paths to the root are in
    separate failure domains. This script reports the offending PGs to stdout.
    """

    """
    This list stores the ceph crush hierarchy retrieved from the
    ceph osd crush tree -f json-pretty
    """
    crushHierarchy = []

    """
    Failure Domains - currently our crush map uses these type IDs - osd,
    host, rack, root
    If we need to add chassis type (or other types) later on, add the
    type to the if statement in the crushFD construction section.

    crushFD[0] = {'id': -2, 'name': 'host1', 'type': 'host'}
    crushFD[23] = {'id': -5, 'name': 'host2', 'type': 'host'}
    crushFD[68] = {'id': -7, 'name': 'host3', 'type': 'host'}
    rack_FD[-2] = {'id': -9, 'name': 'rack1', 'type': 'rack' }
    rack_FD[-15] = {'id': -17, 'name': 'rack2', 'type': 'rack' }
    root_FD[-17] = {'id': -1, 'name': 'default', 'type': 'root' }}
    root_FD[-9] = {'id': -1, 'name': 'default', 'type': 'root' }}
    """
    crushFD = {}

    def __init__(self, poolName):
        if 'all' in poolName or 'All' in poolName:
            try:
                poolLs = 'ceph osd pool ls -f json-pretty'
                poolstr = subprocess.check_output(poolLs, shell=True)
                self.listPoolName = json.loads(poolstr)
            except subprocess.CalledProcessError as e:
                print('{}'.format(e))
                """Unable to get all pools - cannot proceed"""
                sys.exit(2)
        else:
            self.listPoolName = poolName

        try:
            """Retrieve the crush hierarchies"""
            crushTree = "ceph osd crush tree -f json-pretty | grep -v '^\[\]'"
            chstr = subprocess.check_output(crushTree, shell=True)
            self.crushHierarchy = json.loads(chstr)
        except subprocess.CalledProcessError as e:
            print('{}'.format(e))
            """Unable to get crush hierarchy - cannot proceed"""
            sys.exit(2)

        """
        Number of racks configured in the ceph cluster. The racks that are
        present in the crush hierarchy may not be used. The un-used rack
        would not show up in the crushFD.
        """
        self.count_racks = 0

        """depth level - 3 is OSD, 2 is host, 1 is rack, 0 is root"""
        self.osd_depth = 0
        """Construct the Failure Domains - OSD -> Host -> Rack -> Root"""
        for chitem in self.crushHierarchy:
            if chitem['type'] == 'host' or \
               chitem['type'] == 'rack' or \
               chitem['type'] == 'root':
                for child in chitem['children']:
                    self.crushFD[child] = {'id': chitem['id'], 'name': chitem['name'], 'type': chitem['type']}
                if chitem['type'] == 'rack' and len(chitem['children']) > 0:
                    self.count_racks += 1
            elif chitem['type'] == 'osd':
                if self.osd_depth == 0:
                    self.osd_depth = chitem['depth']

        """[ { 'pg-name' : [osd.1, osd.2, osd.3] } ... ]"""
        self.poolPGs = []
        """Replica of the pool. Initialize to 0."""
        self.poolSize = 0

    def getPoolSize(self, poolName):
        """
        size (number of replica) is an attribute of a pool
        { "pool": "rbd", "pool_id": 1, "size": 3 }
        """
        pSize = {}
        """Get the size attribute of the poolName"""
        try:
            poolGet = 'ceph osd pool get ' + poolName + ' size -f json-pretty'
            szstr = subprocess.check_output(poolGet, shell=True)
            pSize = json.loads(szstr)
            self.poolSize = pSize['size']
        except subprocess.CalledProcessError as e:
            print('{}'.format(e))
            self.poolSize = 0
            """Continue on"""
        return

    def checkPGs(self, poolName):
        if not len(self.poolPGs) > 0:
            return
        print('Checking PGs in pool {} ...'.format(poolName)),
        badPGs = False
        for pg in self.poolPGs:
            osdUp = pg['up']
            """
            Construct the OSD path from the leaf to the root. If the
            replica is set to 3 and there are 3 racks. Each OSD has its
            own rack (failure domain). If more than one OSD has the
            same rack, this is a violation. If the number of rack is
            one, then we need to make sure the hosts for the three OSDs
            are different.
            """
            check_FD = {}
            checkFailed = False
            for osd in osdUp:
                traverseID = osd
                """Start the level with 1 to include the OSD leaf"""
                traverseLevel = 1
                while (self.crushFD[traverseID]['type'] != 'root'):
                    crushType = self.crushFD[traverseID]['type']
                    crushName = self.crushFD[traverseID]['name']
                    if crushType in check_FD:
                        check_FD[crushType].append(crushName)
                    else:
                        check_FD[crushType] = [crushName]
                    """traverse up (to the root) one level"""
                    traverseID = self.crushFD[traverseID]['id']
                    traverseLevel += 1
                assert (traverseLevel == self.osd_depth), "OSD depth mismatch"
            """
            check_FD should have
            {
              'host': ['host1', 'host2', 'host3', 'host4'],
              'rack': ['rack1', 'rack2', 'rack3']
            }
            Not checking for the 'root' as there is only one root.
            """
            for ktype in check_FD:
                kvalue = check_FD[ktype]
                if ktype == 'host':
                    """
                    At the host level, every OSD should come from different
                    host. It is a violation if duplicate hosts are found.
                    """
                    if len(kvalue) != len(set(kvalue)):
                        if not badPGs:
                            print('Failed')
                        badPGs = True
                        print('OSDs {} in PG {} failed check in host {}'.format(pg['up'], pg['pgid'], kvalue))
                elif ktype == 'rack':
                    if len(kvalue) == len(set(kvalue)):
                        continue
                    else:
                        """
                        There are duplicate racks. This could be due to
                        situation like pool's size is 3 and there are only
                        two racks (or one rack). OSDs should come from
                        different hosts as verified in the 'host' section.
                        """
                        if self.count_racks == len(set(kvalue)):
                            continue
                        elif self.count_racks > len(set(kvalue)):
                            """Not all the racks were used to allocate OSDs"""
                            if not badPGs:
                                print('Failed')
                            badPGs = True
                            print('OSDs {} in PG {} failed check in rack {}'.format(pg['up'], pg['pgid'], kvalue))
            check_FD.clear()
        if not badPGs:
            print('Passed')
        return

    def checkPoolPGs(self):
        for pool in self.listPoolName:
            self.getPoolSize(pool)
            if self.poolSize == 1:
                """No need to check pool with the size set to 1 copy"""
                print('Checking PGs in pool {} ... {}'.format(pool, 'Skipped'))
                continue
            elif self.poolSize == 0:
                print('Pool {} was not found.'.format(pool))
                continue
            assert (self.poolSize > 1), "Pool size was incorrectly set"

            try:
                """Get the list of PGs in the pool"""
                lsByPool = 'ceph pg ls-by-pool ' + pool + ' -f json-pretty'
                pgstr = subprocess.check_output(lsByPool, shell=True)
                self.poolPGs = json.loads(pgstr)
                """Check that OSDs in the PG are in separate failure domains"""
                self.checkPGs(pool)
            except subprocess.CalledProcessError as e:
                print('{}'.format(e))
                """Continue to the next pool (if any)"""
        return

def Main():
    parser = ArgumentParser(description='''
    Cross-check the OSDs assigned to the Placement Groups (PGs) of a ceph pool
    with the CRUSH topology. The cross-check compares the OSDs in a PG and
    verifies the OSDs reside in separate failure domains. PGs with OSDs in
    the same failure domain are flagged as violation. The offending PGs are
    printed to stdout.

    This CLI is executed on-demand on a ceph-mon pod. To invoke the CLI, you
    can specify one pool or list of pools to check. The special pool name
    All (or all) checks all the pools in the ceph cluster.
    ''',
    formatter_class=RawTextHelpFormatter)
    parser.add_argument('PoolName', type=str, nargs='+',
        help='List of pools (or All) to validate the PGs and OSDs mapping')
    args = parser.parse_args()

    if ('all' in args.PoolName or
        'All' in args.PoolName) and len(args.PoolName) > 1:
        print('You only need to give one pool with special pool All')
        sys.exit(1)

    """
    Retrieve the crush hierarchies and store it. Cross-check the OSDs
    in each PG searching for failure domain violation.
    """
    ccm = cephCRUSH(args.PoolName)
    ccm.checkPoolPGs()

if __name__ == '__main__':
    Main()
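Illustration (not part of the commit): the docstrings above describe a leaf-to-root walk over the CRUSH buckets, with duplicate detection at each failure-domain level. The sketch below reproduces only that core idea against a hypothetical crushFD map (two racks, two hosts, three OSDs); all ids and names are invented, and the real script adds further handling, such as tolerating duplicate racks when the cluster has fewer racks than the pool has replicas.

# Hypothetical topology: osd.0 and osd.1 on host1 (rack1), osd.2 on host2 (rack2).
crushFD = {
    0: {'id': -2, 'name': 'host1', 'type': 'host'},
    1: {'id': -2, 'name': 'host1', 'type': 'host'},
    2: {'id': -3, 'name': 'host2', 'type': 'host'},
    -2: {'id': -9, 'name': 'rack1', 'type': 'rack'},
    -3: {'id': -10, 'name': 'rack2', 'type': 'rack'},
    -9: {'id': -1, 'name': 'default', 'type': 'root'},
    -10: {'id': -1, 'name': 'default', 'type': 'root'},
}

def violated_levels(up_osds):
    """Walk each OSD up to the root and report levels with duplicate buckets."""
    seen = {}
    for osd in up_osds:
        node = osd
        while crushFD[node]['type'] != 'root':
            seen.setdefault(crushFD[node]['type'], []).append(crushFD[node]['name'])
            node = crushFD[node]['id']
    return sorted(level for level, names in seen.items()
                  if len(names) != len(set(names)))

print(violated_levels([0, 1, 2]))   # ['host', 'rack'] - osd.0 and osd.1 share host1
print(violated_levels([0, 2]))      # [] - hosts and racks are all distinct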
ceph-mon/templates/bin/utils/_checkPGs.sh.tpl (new file, 23 lines)
@@ -0,0 +1,23 @@
#!/bin/bash

{{/*
Copyright 2018 The Openstack-Helm Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}

set -ex

monPod=$(kubectl get pods --namespace=${DEPLOYMENT_NAMESPACE} --selector=application=ceph --selector=component=mon --output=jsonpath={.items[0].metadata.name} 2>/dev/null)

kubectl exec -t ${monPod} --namespace=${DEPLOYMENT_NAMESPACE} -- /tmp/utils-checkPGs.py All 2>/dev/null
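Illustration (not part of the commit): the wrapper above only locates a ceph-mon pod and runs /tmp/utils-checkPGs.py with the special pool name All. The snippet below mimics how the script's PoolName argument is interpreted, so the accepted and rejected argument combinations are easy to see; the pool names used here are examples only.

from argparse import ArgumentParser, RawTextHelpFormatter

# Mimics the PoolName handling in utils-checkPGs.py; pool names are examples.
parser = ArgumentParser(description='Validate PG-to-OSD failure domains',
                        formatter_class=RawTextHelpFormatter)
parser.add_argument('PoolName', type=str, nargs='+',
                    help='List of pools (or All) to validate')

for argv in (['All'], ['rbd', 'cephfs_data'], ['All', 'rbd']):
    args = parser.parse_args(argv)
    wants_all = 'all' in args.PoolName or 'All' in args.PoolName
    if wants_all and len(args.PoolName) > 1:
        print('{} -> rejected: give only the special pool All'.format(argv))
    else:
        print('{} -> pools to check: {}'.format(argv, args.PoolName))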
@@ -54,6 +54,12 @@ data:
  moncheck-reap-zombies.py: |
{{ tuple "bin/moncheck/_reap-zombies.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}

  utils-checkPGs.py: |
{{ tuple "bin/utils/_checkPGs.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}

  utils-checkPGs.sh: |
{{ tuple "bin/utils/_checkPGs.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}

{{ if .Values.logging.fluentd }}
  fluentbit-sidecar.sh: |
{{ tuple "bin/mon/_fluentbit-sidecar.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
ceph-mon/templates/cronjob-checkPGs.yaml (new file, 52 lines)
@@ -0,0 +1,52 @@
{{/*
Copyright 2018 The Openstack-Helm Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}

{{- if .Values.manifests.cronjob_checkPGs }}
{{- $envAll := . }}

{{- $serviceAccountName := "ceph-pool-checkpgs" }}
{{ tuple $envAll "pool_checkpgs" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
---
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: {{ $serviceAccountName }}
spec:
  schedule: {{ .Values.jobs.pool_checkPGs.cron | quote }}
  successfulJobsHistoryLimit: {{ .Values.jobs.pool_checkPGs.history.successJob }}
  failedJobsHistoryLimit: {{ .Values.jobs.pool_checkPGs.history.failJob }}
  concurrencyPolicy: {{ .Values.jobs.pool_checkPGs.concurrency.execPolicy }}
  startingDeadlineSeconds: {{ .Values.jobs.pool_checkPGs.startingDeadlineSecs }}
  jobTemplate:
    metadata:
      labels:
{{ tuple $envAll "ceph" "pool-checkpgs" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
    spec:
      template:
        spec:
          containers:
          - name: {{ $serviceAccountName }}
{{ tuple $envAll "ceph_config_helper" | include "helm-toolkit.snippets.image" | indent 12 }}
            env:
            - name: DEPLOYMENT_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            command:
            - /tmp/utils-checkPGs.sh
          restartPolicy: Never

{{- end }}
@@ -156,6 +156,14 @@ spec:
              mountPath: /tmp/mon-check.sh
              subPath: mon-check.sh
              readOnly: true
            - name: ceph-mon-bin
              mountPath: /tmp/utils-checkPGs.py
              subPath: utils-checkPGs.py
              readOnly: true
            - name: ceph-mon-bin
              mountPath: /tmp/utils-checkPGs.sh
              subPath: utils-checkPGs.sh
              readOnly: true
            - name: ceph-mon-etc
              mountPath: /etc/ceph/ceph.conf
              subPath: ceph.conf
@@ -113,6 +113,20 @@
  public: 192.168.0.0/16
  cluster: 192.168.0.0/16

jobs:
  pool_checkPGs:
    # Execute monthly on the 1st at 00:01 AM
    cron: "1 0 1 * *"
    history:
      # Number of successful job to keep
      successJob: 1
      # Number of failed job to keep
      failJob: 1
    concurrency:
      # Skip new job if previous job still active
      execPolicy: Forbid
    startingDeadlineSecs: 60

conf:
  templates:
    keyring:
@@ -319,3 +333,4 @@ manifests:
  service_mon: true
  service_mon_discovery: true
  job_storage_admin_keys: true
  cronjob_checkPGs: true
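Illustration (not part of the commit): jobs.pool_checkPGs.cron above is a standard five-field cron expression, so "1 0 1 * *" fires at 00:01 on the first day of every month. Assuming the croniter package is available (it is not used by the chart; this is only a convenient way to sanity-check a schedule), the next run times can be previewed like this:

from datetime import datetime
from croniter import croniter   # assumption: pip install croniter

schedule = '1 0 1 * *'          # minute=1, hour=0, day-of-month=1
it = croniter(schedule, datetime(2018, 7, 15))
print(it.get_next(datetime))    # 2018-08-01 00:01:00
print(it.get_next(datetime))    # 2018-09-01 00:01:00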
@@ -7,6 +7,7 @@ Contents:
   :maxdepth: 2

   install/index
   testing/index


Indices and Tables
doc/source/testing/ceph-resiliency/README.rst (new file, 21 lines)
@@ -0,0 +1,21 @@
==============================================
Resiliency Tests for OpenStack-Helm-Infra/Ceph
==============================================

Mission
=======

The goal of our resiliency tests for `OpenStack-Helm-Infra/Ceph
<https://github.com/openstack/openstack-helm-infra/tree/master/ceph>`_ is to
show symptoms of software/hardware failure and provide the solutions.

Caveats:
  - Our focus lies on resiliency for various failure scenarios but
    not on performance or stress testing.

Software Failure
================
* `CRUSH Failure Domain <./failure-domain.html>`_

Hardware Failure
================
doc/source/testing/ceph-resiliency/failure-domain.rst (new file, 1234 lines)
File diff suppressed because it is too large
doc/source/testing/ceph-resiliency/index.rst (new file, 9 lines)
@@ -0,0 +1,9 @@
===============
Ceph Resiliency
===============

.. toctree::
   :maxdepth: 2

   README
   failure-domain
doc/source/testing/index.rst (new file, 8 lines)
@@ -0,0 +1,8 @@
=======
Testing
=======

.. toctree::
   :maxdepth: 2

   ceph-resiliency/index