Kubernetes监控与日志管理
# Kubernetes监控与日志管理
# 查看资源集群状态
# 查看master组件状态
[root@master ~]# kubectl get cs
NAME STATUS MESSAGE ERROR
controller-manager Healthy ok
scheduler Healthy ok
etcd-0 Healthy {"health":"true"}
# 查看node状态
[root@master ~]# kubectl get node
NAME STATUS ROLES AGE VERSION
master Ready master 20h v1.18.1
node Ready <none> 20h v1.18.1
# 查看Apiserver代理的URL
[root@master ~]# kubectl cluster-info
Kubernetes master is running at https://172.25.254.140:6443
KubeDNS is running at https://172.25.254.140:6443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy
To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'.
# 查看集群资源详细信息
[root@master ~]# kubectl describe <资源> <名称>
# 查看资源信息
[root@master ~]# kubectl get pods
NAME READY STATUS RESTARTS AGE
pod-1-7cf5dd459b-fjpcb 0/1 ContainerCreating 0 5s
[root@master ~]# kubectl get pod pod-1-7cf5dd459b-fjpcb --watch   # 持续观察Pod创建状态
# 集群错误排查
当我们使用kubectl工具执行命令出现错误时,按以下链路排查:
kubectl > kube-apiserver > etcd
当get node的时候有节点显示NotReady状态,怎么去排查?
kubelet > 容器运行时(runtime)
# 集群不健康
当 kubectl get cs 显示某个组件(如 scheduler、controller-manager)不健康(Unhealthy)时怎么办?
解决方法:
cd /etc/kubernetes/manifests
然后将 kube-scheduler.yaml 和 kube-controller-manager.yaml 中的 --port=0 参数注释掉
containers:
- command:
- kube-scheduler
- --authentication-kubeconfig=/etc/kubernetes/scheduler.conf
- --authorization-kubeconfig=/etc/kubernetes/scheduler.conf
- --bind-address=127.0.0.1
- --kubeconfig=/etc/kubernetes/scheduler.conf
- --leader-elect=true
# - --port=0
image: k8s.gcr.io/kube-scheduler:<版本号>
[root@master ~]# systemctl restart kubelet
[root@master ~]# kubectl get cs
Warning: v1 ComponentStatus is deprecated in v1.19+
NAME STATUS MESSAGE ERROR
scheduler Healthy ok
controller-manager Healthy ok
etcd-0 Healthy {"health":"true","reason":""}
# 监控集群资源利用率
# 监控数据聚合器
kubectl > apiserver > metrics-server > kubelet(cAdvisor)(https)
# Metrics Server部署:
[root@master ~]# wget https://github.com/kubernetes-sigs/metrics-server/releases/download/v0.3.7/components.yaml
[root@master ~]# vim components.yaml
...
containers:
- name: metrics-server
image: lizhenliang/metrics-server:v0.3.7 #修改镜像
imagePullPolicy: IfNotPresent
args:
- --cert-dir=/tmp
- --secure-port=4443
- --kubelet-insecure-tls #不验证kubelet提供的https证书
- --kubelet-preferred-address-types=InternalIP #使用节点IP连接kubelet
...
创建yaml文件生效
[root@master ~]# kubectl apply -f components.yaml
# 如果提示报错:
[root@master ~]# kubectl apply -f metrics-server.yaml
error: unable to recognize "metrics-server.yaml": no matches for kind "APIService" in version "apiregistration.k8s.io/v1beta1"
修改yaml文件
---
apiVersion: apiregistration.k8s.io/v1 # 将 v1beta1 改为 v1(去掉 beta1)
kind: APIService
metadata:
name: v1beta1.metrics.k8s.io
执行yaml
[root@master ~]# kubectl apply -f metrics-server.yaml
clusterrole.rbac.authorization.k8s.io/system:aggregated-metrics-reader unchanged
clusterrolebinding.rbac.authorization.k8s.io/metrics-server:system:auth-delegator unchanged
rolebinding.rbac.authorization.k8s.io/metrics-server-auth-reader unchanged
apiservice.apiregistration.k8s.io/v1beta1.metrics.k8s.io created
serviceaccount/metrics-server unchanged
deployment.apps/metrics-server unchanged
service/metrics-server unchanged
clusterrole.rbac.authorization.k8s.io/system:metrics-server unchanged
clusterrolebinding.rbac.authorization.k8s.io/system:metrics-server unchanged
# 查看Node资源消耗
kubectl top node <node name>
[root@master ~]# kubectl top node
NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
master 146m 7% 1853Mi 15%
node 61m 3% 945Mi 12%
[root@master ~]#
[root@master ~]# kubectl top node master
NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
master 146m 7% 1853Mi 15%
# 查看Pod资源消耗
kubectl top pod <pod name>
[root@master ~]# kubectl top pod calico-node-9w9tt -n kube-system
NAME CPU(cores) MEMORY(bytes)
calico-node-9w9tt 20m 113Mi
# 管理K8s应用日志
kubectl logs <Pod名称>
kubectl logs -f <Pod名称>
kubectl logs -f <Pod名称> -c <容器名称>
k8s查看标准输出日志流程
kubectl logs > apiserver > kubelet > docker(接管了容器标准输出日志并写到了/var/lib/docker/containers/<container_id>/<container_id>-json.log)
[root@master ~]# cd /var/lib/docker/containers/0852da4a53a2ccbeff729fc48d6d3e96915f80e1ffcd0bd65240577d5a88176c
[root@master 0852da4a53a2ccbeff729fc48d6d3e96915f80e1ffcd0bd65240577d5a88176c]# ll
-rw-r----- 1 root root 108 Oct 20 14:23 0852da4a53a2ccbeff729fc48d6d3e96915f80e1ffcd0bd65240577d5a88176c-json.log
drwx------ 2 root root 6 Oct 20 14:22 checkpoints
-rw------- 1 root root 3177 Oct 20 14:24 config.v2.json
-rw-r--r-- 1 root root 1553 Oct 20 14:24 hostconfig.json
-rw-r--r-- 1 root root 7 Oct 20 14:22 hostname
-rw-r--r-- 1 root root 201 Oct 20 14:22 hosts
drwx--x--- 3 root root 17 Oct 20 14:22 mounts
-rw-r--r-- 1 root root 114 Oct 20 14:22 resolv.conf
进入所有容器终端
[root@master ~]# kubectl exec -it web-96d5df5c8-vwwlv -- bash
root@web-96d5df5c8-vwwlv:~# cd /var/log/nginx/
root@web-96d5df5c8-vwwlv:/var/log/nginx# ls -l
total 0
lrwxrwxrwx 1 root root 11 Oct 12 02:03 access.log -> /dev/stdout
lrwxrwxrwx 1 root root 11 Oct 12 02:03 error.log -> /dev/stderr
容器中应用日志可以使用emptyDir数据卷将日志文件持久化到宿主机上。
宿主机的路径:
/var/lib/kubelet/pods/<pod-id>/volumes/kubernetes.io~empty-dir/logs/access.log
# Pod创建一个边车容器读取业务容器日志
[root@master ~]# cat pod.yaml
apiVersion: v1
kind: Pod
metadata:
name: my-pod
namespace: pod
spec:
containers:
- name: web
image: lizhenliang/nginx-php
volumeMounts:
- name: logs
mountPath: /usr/local/nginx/logs
- name: log
image: busybox
args: [/bin/sh, -c, 'tail -f /opt/access.log']
volumeMounts:
- name: logs
mountPath: /opt
volumes:
- name: logs
emptyDir: {}
执行yaml并且检测
[root@master ~]# kubectl apply -f pod.yaml
pod/my-pod configured
[root@master ~]# kubectl logs my-pod -c web -n pod
Starting php-fpm done
nginx: the configuration file /usr/local/nginx/conf/nginx.conf syntax is ok
nginx: configuration file /usr/local/nginx/conf/nginx.conf test is successful
# 收集K8s日志思路
日志平台搭建哪个技术用的多?
- ELK:重量级 Elasticsearch+Logstash(日志采集器,使用go写了filebeat替代日志采集功能)+Kibana
- Graylog、Loki:轻量级
- 官方推荐的EFK日志系统是啥? Elasticsearch+Fluentd(日志采集器)+Kibana
上次更新: 2023/11/28, 22:03:59