If a disk failure causes an OSD to go into the down state shown below and it never recovers (a value of 0 in the reweight column means the OSD has already been marked out of the cluster):
[root@os-node3 ~]# ceph osd tree
# id    weight  type name       up/down reweight
-1      4       root default
-2      1           host os-node5
24      1               osd.24  down    0
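Before pulling the disk it can help to confirm which block device backs the failed OSD; a quick check, assuming the data directory still sits at the default path used later in this post:
df -h /var/lib/ceph/osd/ceph-24    # shows the device (if still mounted) behind the OSD data directory
mount | grep ceph-24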
Get the ID of the down OSD with the following command:
osd_id=`ceph osd tree | grep down | grep osd | awk '{print $3}' | awk -F . '{print $2}'`
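Before acting on the variable it is worth confirming what was captured; a quick sanity check, assuming exactly one OSD is down as in the output above:
echo ${osd_id}    # should print 24 for the example above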
1) Remove the OSD from the cluster
[root@PBS-OS-node155 ~]# ceph osd rm 24
removed osd.24
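ceph osd rm refuses to remove an OSD that is still marked up, so if the failed daemon happens to still be running it should be stopped first; a minimal example, assuming the same sysvinit-managed setup used at the end of this post:
/etc/init.d/ceph stop osd.24    # stop the daemon so the OSD stays down before removal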
2) Remove the OSD from the CRUSH map
[root@PBS-OS-node155 ~]# ceph osd crush rm osd.24
removed item id 24 name 'osd.24' from crush map
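To confirm the entry is really gone from the CRUSH map, list the tree again; this is only a sanity check:
ceph osd tree | grep osd.24    # should print nothing once the item has been removed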
3) Delete the OSD's authentication key from the Ceph cluster
[root@PBS-OS-node155 ~]# ceph auth del osd.24
updated
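The key deletion can be verified as well (the exact error text varies by Ceph release):
ceph auth get osd.24    # should now fail, since the key has been deleted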
4) Unmount the disk the OSD was mounted on
[root@PBS-OS-node155 ~]# umount /var/lib/ceph/osd/ceph-24
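A quick check that the data directory is no longer mounted:
mount | grep ceph-24    # no output means the unmount succeeded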
The complete script for removing the OSD is as follows:
# find the ID of the OSD that is marked down
osd_id=`ceph osd tree | grep down | grep osd | awk '{print $3}' | awk -F . '{print $2}'`
# remove the OSD from the cluster map, the CRUSH map and the auth database, then unmount its disk
ceph osd rm ${osd_id}
ceph osd crush rm osd.${osd_id}
ceph auth del osd.${osd_id}
umount /var/lib/ceph/osd/ceph-${osd_id}
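The script above assumes that exactly one OSD is down; a slightly hardened sketch (the guard is an addition, not part of the original procedure) that aborts when the ID cannot be determined cleanly:
osd_id=`ceph osd tree | grep down | grep osd | awk '{print $3}' | awk -F . '{print $2}'`
# bail out if no down OSD was found, or if more than one matched
if [ -z "${osd_id}" ] || [ `echo "${osd_id}" | wc -l` -ne 1 ]; then
    echo "expected exactly one down osd, got: '${osd_id}'" >&2
    exit 1
fi
ceph osd rm ${osd_id}
ceph osd crush rm osd.${osd_id}
ceph auth del osd.${osd_id}
umount /var/lib/ceph/osd/ceph-${osd_id}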
After the disk has been replaced, add it back into the cluster as an OSD:
# allocate a new OSD ID
osd_id=`ceph osd create`
# make a filesystem on the replacement disk (/dev/sdf here) and mount it as the OSD data directory
mkfs.xfs -f /dev/sdf
mount /dev/sdf /var/lib/ceph/osd/ceph-${osd_id}
mount -o remount,user_xattr /var/lib/ceph/osd/ceph-${osd_id}
# initialize the OSD data directory and key, then register the key with the cluster
ceph-osd -i ${osd_id} --mkfs --mkkey
ceph auth add osd.${osd_id} osd 'allow *' mon 'allow profile osd' -i /var/lib/ceph/osd/ceph-${osd_id}/keyring
# mark the OSD as sysvinit-managed and start it
touch /var/lib/ceph/osd/ceph-${osd_id}/sysvinit
/etc/init.d/ceph start osd.${osd_id}
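With 'osd crush update on start' enabled (the default), the init script should place the new OSD back into the CRUSH map when it starts; the result can then be verified with:
ceph osd tree    # the new osd.${osd_id} should show up/in under its host
ceph -s          # watch the cluster recover data onto the replacement disk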