title: 8.2.k8s集群监控
order: 44
icon: lightbulb
一、环境
主机名 | IP地址 | 系统 | 说明 |
k8s | 192.168.11.65 | Ubuntu 20.04 | k8s版本:v1.23.10 单机版本 |
准备环境
通过克隆的方式准备了一台全新的Ubuntu 20.04服务器(4核8g)
1、设置主机名为k8s
sudo hostnamectl set-hostname k8s
2、使用KubeKey安装k8s
系统要求
使用kubekey安装k8s要求的系统及版本
Ubuntu 16.04, 18.04, 20.04, 22.04
Debian Buster, Stretch
CentOS/RHEL 7
AlmaLinux 9.0
SUSE Linux Enterprise Server 15
安装k8s单机版
- 因为我是测试,所以安装k8s单机版,生产一般建议最少3台
#下载KubeKey
export KKZONE=cn
curl -sfL https://get-kk.kubesphere.io | VERSION=v3.0.7 sh -
#创建配置文件
./kk create config --name linge
#修改配置文件。linge是上步指定的名称
vim config-linge.yaml
#安装依赖
apt -y install socat conntrack
#安装k8s命令
./kk create cluster -f config-linge.yaml
#检查命令
kubectl get pod -A
安装完成后检查,全部running表示正常
在 Kubernetes 集群中创建一个 pod, 验证是否正常运行
#创建nginx容器
kubectl create deployment nginx --image=nginx
#暴露nginx端口
kubectl expose deployment nginx --port=80 --type=NodePort
# 查看pod以及服务
kubectl get pod,svc
http://192.168.11.65:<NodePort>/    #NodePort为上一步kubectl get svc查询到的随机端口
安装k8s单机版配置
root@k8s:~# cat config-linge.yaml
apiVersion: kubekey.kubesphere.io/v1alpha2
kind: Cluster
metadata:
name: linge
spec:
hosts:
- {name: k8s, address: 192.168.11.65, internalAddress: 192.168.11.65, user: linge, password: "xxx"}
roleGroups:
etcd:
- k8s
control-plane:
- k8s
worker:
- k8s
controlPlaneEndpoint:
## Internal loadbalancer for apiservers
# internalLoadbalancer: haproxy
domain: lb.kubesphere.local
address: ""
port: 6443
kubernetes:
#指定k8s版本
version: v1.23.10
clusterName: cluster.local
autoRenewCerts: true
containerManager: docker
etcd:
type: kubekey
network:
plugin: calico
kubePodsCIDR: 10.233.64.0/18
kubeServiceCIDR: 10.233.0.0/18
## multus support. https://github.com/k8snetworkplumbingwg/multus-cni
multusCNI:
enabled: false
registry:
privateRegistry: ""
namespaceOverride: ""
registryMirrors: []
insecureRegistries: []
addons: []
安装截图
添加新节点
#编辑之前通过./kk创建的配置文件,
vim config-linge.yaml
#添加如下配置,注意192.168.11.67为新node的ip,name为计算机名
spec:
hosts:
- {name: node2, address: 192.168.11.67, internalAddress: 192.168.11.67, user: linge, password: "cdring"}
#执行添加命令
./kk add nodes -f config-linge.yaml
#等待自动完成
二、在K8S中安装Prometheus Operator
Prometheus Operator 是一个由 CoreOS 团队开发的开源 Prometheus 工具集,它将 Prometheus、Grafana、Alertmanager、Service Discovery 和其他监控相关的组件打包在一起,方便在 Kubernetes 环境中部署和使用
prometheus-operator GitHub地址
通过kubectl create安装
kube-prometheus GitHub地址
通过kubectl create安装
kube-prometheus-stack GitHub地址
通过helm安装
1、安装kube-prometheus-stack
通过添加prometheus-community仓库把kube-prometheus-stack包下载下来,需要“梯子”
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
#把包下载下来
helm fetch prometheus-community/kube-prometheus-stack
tar xf kube-prometheus-stack-45.8.0.tgz
通过wget下载
wget https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-45.8.0/kube-prometheus-stack-45.8.0.tgz
tar xf kube-prometheus-stack-45.8.0.tgz
修改镜像源
国外镜像源某些镜像无法拉取,我们这里修改prometheus-operator,prometheus,alertmanager,kube-state-metrics,node-exporter的镜像源为国内镜像源。我这里使用的是中科大的镜像源。
#检查
grep -r -A 2 'image:' kube-prometheus-stack/
#批量替换
sed -i 's/quay.io/quay.mirrors.ustc.edu.cn/g' `grep "quay.io" -rl kube-prometheus-stack/*`
docker pull registry.aliyuncs.com/google_containers/kube-webhook-certgen:v20221220-controller-v1.5.1-58-g787ea74b6
docker tag registry.aliyuncs.com/google_containers/kube-webhook-certgen:v20221220-controller-v1.5.1-58-g787ea74b6 registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20221220-controller-v1.5.1-58-g787ea74b6
docker pull bitnami/kube-state-metrics:2.8.2
docker tag bitnami/kube-state-metrics:2.8.2 registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.8.2
修改配置文件
- 修改grafana的密码
vim kube-prometheus-stack/values.yaml
grafana:
adminPassword: 填写自己的密码
或:使用sed修改
sed -i 's#adminPassword: prom-operator#adminPassword: password#g' kube-prometheus-stack/values.yaml
本地安装
helm install -n monitoring --create-namespace prometheus kube-prometheus-stack
检查pod
root@k8s:~# kubectl get pod -n monitoring
NAME READY STATUS RESTARTS AGE
alertmanager-prometheus-kube-prometheus-alertmanager-0 2/2 Running 1 (16h ago) 16h
prometheus-blackbox-exporter-57576d69b9-tz4rc 1/1 Running 0 15h
prometheus-grafana-894b6bd64-jxg82 3/3 Running 0 16h
prometheus-kube-prometheus-operator-84cdc87db8-nmdsj 1/1 Running 0 16h
prometheus-kube-state-metrics-78685d656f-kt4vl 1/1 Running 0 16h
prometheus-mysql-exporter-84d44dd7d8-c65bs 1/1 Running 0 14m
prometheus-prometheus-kube-prometheus-prometheus-0 2/2 Running 0 16h
prometheus-prometheus-node-exporter-q2g5v 1/1 Running 0 16h
检查svc
root@k8s:~# kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
prometheus-grafana ClusterIP 10.233.59.19 <none> 80/TCP 43h
prometheus-kube-prometheus-alertmanager ClusterIP 10.233.17.147 <none> 9093/TCP 43h
prometheus-kube-prometheus-prometheus ClusterIP 10.233.0.59 <none> 9090/TCP 43h
检查deploy
root@k8s:~# kubectl get deploy -n monitoring
NAME READY UP-TO-DATE AVAILABLE AGE
prometheus-grafana 1/1 1 1 4h7m
prometheus-kube-prometheus-operator 1/1 1 1 4h7m
prometheus-kube-state-metrics 1/1 1 1 4h7m
配置外部访问地址
kubectl port-forward --address=0.0.0.0 svc/prometheus-kube-prometheus-prometheus -n monitoring 9090:9090 &
kubectl port-forward --address=0.0.0.0 svc/prometheus-kube-prometheus-alertmanager -n monitoring 9093:9093 &
kubectl port-forward --address=0.0.0.0 svc/prometheus-grafana -n monitoring 3000:80 &
查看四类自定义资源
Prometheus:声明式创建和管理Prometheus Server实例;
ServiceMonitor:负责声明式的管理监控配置;
PrometheusRule:负责声明式的管理告警配置;
Alertmanager:声明式的创建和管理Alertmanager实例。
查看Prometheus
kubectl get prometheus -n monitoring prometheus-kube-prometheus-prometheus -oyaml
podMonitorSelector:
matchLabels:
release: prometheus
#选择label为如下的probe
probeSelector:
matchLabels:
release: prometheus
#选择label为如下的PrometheusRule
ruleSelector:
matchLabels:
release: prometheus
#选择label为如下的serviceMonitor
serviceMonitorSelector:
matchLabels:
release: prometheus
matchLabels
用于定义一组Label
查看Alertmanager
kubectl get Alertmanager -n monitoring prometheus-kube-prometheus-alertmanager -oyaml
查看ServiceMonitor
kubectl get servicemonitors -n monitoring
查看PrometheusRule
kubectl get prometheusrules -n monitoring
碰到的问题
问题1:
监控kube-proxy有问题,如下图:
解决
编辑kube-proxy配置文件
kubectl edit cm/kube-proxy -n kube-system
把metricsBindAddress: ""
修改为metricsBindAddress: 0.0.0.0
...
kind: KubeProxyConfiguration
metricsBindAddress: 0.0.0.0
...
删除pod,重启kube-proxy
kubectl delete pod -l k8s-app=kube-proxy -n kube-system
重启后检查
kubectl get pod -l k8s-app=kube-proxy -n kube-system
http://192.168.11.65:9090/targets?search=
问题2:
描述:kube-controller-manager 和 kube-scheduler的监控也有问题,如下图:
解决:
在每台master节点执行
vim /etc/kubernetes/manifests/kube-scheduler.yaml
将
--bind-address=127.0.0.1
改为
--bind-address=0.0.0.0
vim /etc/kubernetes/manifests/kube-controller-manager.yaml
将
--bind-address=127.0.0.1
改为
--bind-address=0.0.0.0
重启kubelet
systemctl restart kubelet.service
问题3:
kubectl describe pod prometheus-kube-prometheus-admission-create-8jmhj -n monitoring
从registry.k8s.io下载ingress-nginx/kube-webhook-certgen下载失败
解决:
把registry.k8s.io仓库地址修改为中科大的仓库地址,修改方法见上面的“修改镜像源”
问题4:
从registry.k8s.io下载kube-state-metrics失败
kubectl describe pod prometheus-kube-state-metrics-78685d656f-sz66n -n monitoring
解决:
把registry.k8s.io仓库地址修改为中科大的仓库地址,修改方法见上面的“修改镜像源”
三、我的微信
如果碰到问题,可以随时加我微信,谢谢
评论区