# Kubernetes Cluster Maintenance
# Backing Up and Restoring Etcd Data
Kubernetes stores the cluster's state in the etcd database in real time; for safety's sake, it must be backed up.
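In a kubeadm-deployed cluster, etcd itself usually runs as a static Pod on the control-plane node, and its manifest records the data directory and certificate paths used below; a quick way to confirm before backing up (assuming a kubeadm setup):
[root@master ~]# kubectl get pods -n kube-system | grep etcd
[root@master ~]# grep -E 'data-dir|cert' /etc/kubernetes/manifests/etcd.yaml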
# Install etcd (we only need the etcdctl client here)
[root@master etcd]# yum install -y etcd
[root@master ~]# etcdctl --version
etcdctl version: 3.3.11
API version: 2
# Back up the etcd database
If the etcdctl tool is version 3 but the default API version is 2, switch to API version 3 via the ETCDCTL_API=3 environment variable.
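To avoid prefixing every command with the variable, export it for the current shell session (a small convenience; it only lasts until the shell exits):
[root@master ~]# export ETCDCTL_API=3
[root@master ~]# etcdctl version # with API version 3, the subcommand is `version` rather than `--version`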
[root@master etcd]# ETCDCTL_API=3 etcdctl \
snapshot save snap.db \
--endpoints=https://127.0.0.1:2379 \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key
Snapshot saved at snap.db
- snapshot save snap.db: take an online backup and write it to the named file.
- --endpoints: the etcd endpoint to connect to. Check with `ss -ntpl | grep 2379` whether etcd is listening on the host network (this depends on the network plugin); access requires CA certificate verification.
- --cacert: the ca.crt in the etcd subdirectory of the shared certificate directory.
- --cert: the server certificate.
- --key: the private key.
[root@master ~]# du -sh snap.db # check the size
4.2M snap.db
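Before relying on the snapshot, it is worth verifying its integrity; `etcdctl snapshot status` reports the hash, revision, key count, and size (the figures depend on your cluster):
[root@master ~]# ETCDCTL_API=3 etcdctl snapshot status snap.db --write-out=table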
# Delete a deployment (to simulate data loss)
[root@master ~]# kubectl get deployment
NAME READY UP-TO-DATE AVAILABLE AGE
web 1/1 1 1 161m
[root@master ~]# kubectl delete deployment web
deployment.apps "web" deleted
[root@master ~]# kubectl get deployment
No resources found in default namespace.
As shown, there is currently no deployment.
# Restore the data
- First stop the kube-apiserver and etcd containers: moving the static Pod manifests out of /etc/kubernetes/manifests makes the kubelet stop them.
- Renaming the old directories, rather than deleting them, keeps a backup for this test.
[root@master ~]# mv /etc/kubernetes/manifests /etc/kubernetes/manifests.bak
[root@master ~]# mv /var/lib/etcd/ /var/lib/etcd.back
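The kubelet watches /etc/kubernetes/manifests and stops the static Pods shortly after the directory disappears; you can confirm before restoring (this cluster uses the Docker runtime; with containerd, use `crictl ps` instead):
[root@master ~]# docker ps | grep -E 'etcd|kube-apiserver' # should print nothing once both containers have exited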
# Restore the snapshot
[root@master ~]# ETCDCTL_API=3 etcdctl snapshot restore snap.db --data-dir=/var/lib/etcd
2021-11-08 22:46:35.934598 I | mvcc: restore compact to 19609
2021-11-08 22:46:35.941164 I | etcdserver/membership: added member 8e9e05c52164694d [http://localhost:2380] to cluster cdf818194e3a8c32
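This default restore is enough for the single-member etcd used here. For a multi-member etcd cluster, each member must be restored with its own name and peer URLs; a rough sketch (the member names and IPs are placeholders, run once per member):
[root@master1 ~]# ETCDCTL_API=3 etcdctl snapshot restore snap.db \
--name master1 \
--initial-cluster master1=https://192.168.1.18:2380,master2=https://192.168.1.19:2380 \
--initial-advertise-peer-urls https://192.168.1.18:2380 \
--data-dir=/var/lib/etcd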
[root@master ~]# mv /etc/kubernetes/manifests.bak/ /etc/kubernetes/manifests
[root@master ~]# systemctl restart kubelet
# Verify the restored data
- Wait a moment here for the Pods to be rebuilt.
- Then check whether the earlier deployment is back.
[root@master ~]# kubectl get pods -A
NAMESPACE NAME READY STATUS RESTARTS AGE
default web-96d5df5c8-w2vvf 1/1 Running 0 172m
ingress-nginx nginx-ingress-controller-gvfvl 1/1 Running 0 3h29m
kube-system calico-kube-controllers-659bd7879c-qv5qx 1/1 Running 6 4h2m
kube-system calico-node-df8kn 1/1 Running 0 4h2m
kube-system calico-node-xrmnz 1/1 Running 0 3h57m
kube-system client3 1/1 Running 0 79m
kube-system coredns-7f89b7bc75-jwxfg 1/1 Running 0 4h2m
kube-system coredns-7f89b7bc75-qf8dg 1/1 Running 0 4h2m
kube-system etcd-master 1/1 Running 0 4h2m
kube-system kube-apiserver-master 1/1 Running 0 4h2m
kube-system kube-controller-manager-master 1/1 Running 0 4h
kube-system kube-proxy-mbhtv 1/1 Running 1 4h2m
kube-system kube-proxy-n8qc8 1/1 Running 0 3h57m
kube-system kube-scheduler-master 1/1 Running 0 4h2m
[root@master ~]# kubectl get deployment
NAME READY UP-TO-DATE AVAILABLE AGE
web 1/1 1 1 172m
# Upgrading the K8S Cluster Version
Kubernetes releases a new minor version roughly every three months.
# Upgrade strategies:
- Always stay on the latest version.
- Upgrade every six months, staying one or two minor versions behind the community.
- Upgrade once a year or less often, falling far behind.
# Basic upgrade flow
- Upgrade the first control-plane node > upgrade the other control-plane nodes > upgrade the worker nodes
# Notes:
Back up all components and data (e.g., etcd) before upgrading.
Never jump across multiple minor versions in a single upgrade, e.g., from 1.16 straight to 1.19. For example:
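Moving a cluster from 1.16 to 1.19 takes three separate rounds, each following the full procedure below (version numbers here are illustrative):
- round 1: 1.16.x -> 1.17.x (upgrade kubeadm, `kubeadm upgrade apply v1.17.x`, then kubelet and kubectl)
- round 2: 1.17.x -> 1.18.x (same steps)
- round 3: 1.18.x -> 1.19.x (same steps)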
# Upgrading a K8S Cluster with kubeadm
# Upgrading the control-plane node
# Find the available version numbers:
[root@master ~]# yum list --showduplicates kubeadm
# Upgrade kubeadm:
[root@master ~]# yum install -y kubeadm-1.20.1-0
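Confirm that the new kubeadm is in place before continuing:
[root@master ~]# kubeadm version -o short
v1.20.1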
# Drain the Pods from the master node and mark it unschedulable:
[root@master ~]# kubectl drain master
node/master cordoned
error: unable to drain node "master", aborting command...
There are pending nodes to be drained:
master
error: cannot delete DaemonSet-managed Pods (use --ignore-daemonsets to ignore): kube-system/calico-node-df8kn, kube-system/kube-proxy-mbhtv
[root@master ~]# kubectl drain master --ignore-daemonsets
node/master already cordoned
node/master evicted
# Check whether the cluster can be upgraded, and get the versions it can be upgraded to:
[root@master ~]# kubeadm upgrade plan
···
You can now apply the upgrade by executing the following command:
kubeadm upgrade apply v1.20.12 # the plan suggests the latest available patch release
Note: Before you can perform this upgrade, you have to update kubeadm to v1.20.12.
# Perform the upgrade:
The plan suggests the newest patch release (v1.20.12 here); since kubeadm 1.20.1 was installed, v1.20.1 is applied instead.
[root@master ~]# kubeadm upgrade apply v1.20.1
[upgrade/config] Making sure the configuration is correct:
[upgrade/config] Reading configuration from the cluster...
[upgrade/config] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -o yaml'
[preflight] Running pre-flight checks.
[upgrade] Running cluster health checks
[upgrade/version] You have chosen to change the cluster version to "v1.20.1"
[upgrade/versions] Cluster version: v1.20.0
[upgrade/versions] kubeadm version: v1.20.1
[upgrade/confirm] Are you sure you want to proceed with the upgrade? [y/N]: y
···
[upgrade/successful] SUCCESS! Your cluster was upgraded to "v1.20.1". Enjoy!
[upgrade/kubelet] Now that your control plane is upgraded, please proceed with upgrading your kubelets if you haven't already done so.
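At this point the control plane runs the new version while kubectl and kubelet are still the old ones; a quick sanity check (output abbreviated):
[root@master ~]# kubectl version --short
Client Version: v1.20.0
Server Version: v1.20.1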
# Mark the node schedulable again:
[root@master ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
master Ready,SchedulingDisabled control-plane,master 4h45m v1.20.0
node Ready <none> 4h40m v1.20.0
[root@master ~]# kubectl uncordon master
node/master uncordoned
# Upgrade kubelet and kubectl:
[root@master ~]# yum install -y kubectl-1.20.1-0 kubelet-1.20.1-0
# Restart kubelet:
[root@master ~]# systemctl daemon-reload
[root@master ~]# systemctl restart kubelet
[root@master ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
master Ready control-plane,master 4h49m v1.20.1
node Ready <none> 4h44m v1.20.0
# Upgrading the worker node
# Upgrade kubeadm:
[root@node ~]# yum install -y kubeadm-1.20.1-0
# Drain the Pods from the node and mark it unschedulable (run from the master):
[root@master ~]# kubectl drain node --ignore-daemonsets --force
node/node already cordoned
node/node evicted
# Upgrade the kubelet configuration (run on the worker node itself):
[root@node ~]# kubeadm upgrade node
# Upgrade kubelet and kubectl:
[root@node ~]# yum install -y kubectl-1.20.1-0 kubelet-1.20.1-0
# Restart kubelet:
[root@node ~]# systemctl daemon-reload
[root@node ~]# systemctl restart kubelet
# Mark the node schedulable again:
[root@master ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
master Ready control-plane,master 4h59m v1.20.1
node Ready,SchedulingDisabled <none> 4h54m v1.20.1
[root@master ~]# kubectl uncordon node
node/node uncordoned
[root@master ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
master Ready control-plane,master 4h59m v1.20.1
node Ready <none> 4h54m v1.20.1
# Taking a K8s Cluster Node Offline Correctly
If you want to maintain or remove a node, the correct procedure is as follows:
# Get the node list:
[root@master ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
master Ready control-plane,master 5h3m v1.20.1
node Ready <none> 4h57m v1.20.1
# Mark the node unschedulable (cordon only stops new Pods from being scheduled; Pods already on the node keep running until drained):
[root@master ~]# kubectl cordon node
node/node cordoned
# Drain the Pods from the node:
[root@master ~]# kubectl drain node --ignore-daemonsets
node/node already cordoned
node/node evicted
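After the drain, only DaemonSet-managed Pods (such as calico-node and kube-proxy) should remain on the node; one way to check, using kubectl's standard field selector:
[root@master ~]# kubectl get pods -A --field-selector spec.nodeName=node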
# Remove the node:
[root@master ~]# kubectl delete node node
node "node" deleted
[root@master ~]# kubectl get node
NAME STATUS ROLES AGE VERSION
master Ready control-plane,master 5h4m v1.20.1
# Rejoining a Node
# Reset the node with kubeadm:
[root@node ~]# kubeadm reset
[reset] WARNING: Changes made to this host by 'kubeadm init' or 'kubeadm join' will be reverted.
[reset] Are you sure you want to proceed? [y/N]: y
[preflight] Running pre-flight checks
# Generate a new token on the master and compute the CA certificate hash:
[root@master ~]# kubeadm token create
mmvxlw.lgpb891ml8g7q9oq
[root@master ~]# openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
0026319b035eef72f40ba1545358f75113756c7b97e9925f4dd3105a01fbdb01
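As a shortcut, `kubeadm token create --print-join-command` creates the token, computes the CA certificate hash, and prints the complete join command in one step:
[root@master ~]# kubeadm token create --print-join-command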
# Rejoin the node to the cluster:
[root@node ~]# kubeadm join 192.168.1.18:6443 --token mmvxlw.lgpb891ml8g7q9oq --discovery-token-ca-cert-hash sha256:0026319b035eef72f40ba1545358f75113756c7b97e9925f4dd3105a01fbdb01
[preflight] Running pre-flight checks
[WARNING SystemVerification]: this Docker version is not on the list of validated versions: 20.10.10. Latest validated version: 19.03
[preflight] Reading configuration from the cluster...
[preflight] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -o yaml'
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Starting the kubelet
[kubelet-start] Waiting for the kubelet to perform the TLS Bootstrap...
This node has joined the cluster:
* Certificate signing request was sent to apiserver and a response was received.
* The Kubelet was informed of the new secure connection details.
Run 'kubectl get nodes' on the control-plane to see this node join the cluster.
# Check the node list:
[root@master ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
master Ready control-plane,master 5h13m v1.20.1
node Ready <none> 32s v1.20.1