侧边栏壁纸
博主头像
一揽芳华 博主等级

行动起来,活在当下

  • 累计撰写 265 篇文章
  • 累计创建 24 个标签
  • 累计收到 4 条评论

目 录CONTENT

文章目录

Prometheus实战

芳华是个男孩!
2024-10-11 / 0 评论 / 2 点赞 / 63 阅读 / 0 字
广告 广告

配置Prometheus+node_exporter+alertmanager+grafana+PrometheusAlert实战

1、服务器信息

主机信息

root@prometheus:~# hostnamectl 
 Static hostname: prometheus
       Icon name: computer-vm
         Chassis: vm
      Machine ID: 0611b7b27b2f478b9c47fc8d272b9117
         Boot ID: 57c175486ff146dda39708b24f6b7bec
  Virtualization: kvm
Operating System: Ubuntu 22.04.4 LTS               
          Kernel: Linux 5.15.0-119-generic
    Architecture: x86-64
 Hardware Vendor: QEMU
  Hardware Model: Standard PC _i440FX + PIIX, 1996_

Prometheus所在目录及权限

root@prometheus:~# ll /prometheus
total 4
drwxr-xr-x 10 prometheus prometheus  174 Sep  6 13:58 ./
drwxr-xr-x 25 root       root       4096 Sep  6 10:23 ../
drwxr-xr-x  7 prometheus prometheus  126 Sep  6 09:55 PrometheusAlert/
drwxr-xr-x  3 prometheus prometheus  133 Sep  6 09:55 alertmanager/
drwxr-xr-x  3 prometheus prometheus  172 Sep  6 09:55 app/
drwxr-xr-x 12 prometheus prometheus  240 Sep  6 09:55 grafana/
drwxr-xr-x  2 root       root          6 Sep  6 13:58 node_collect/
drwxr-xr-x  2 prometheus prometheus   56 Sep  6 09:55 node_exporter/
drwxr-xr-x  7 prometheus prometheus  235 Sep  9 16:01 prometheus/

2、软件版本


-rw-r--r--  1 prometheus prometheus  30866868 Sep  6 09:55 alertmanager-0.27.0.linux-amd64.tar.gz           #触发告警
drwxr-xr-x  7 prometheus prometheus 135768146 Sep  2 14:34 grafana-enterprise-11.2.0.linux-amd64.tar.gz   #图表展示
-rw-r--r--  1 prometheus prometheus  31273622 Sep  6 09:55 PrometheusAlert-4.8.0.zip                    #告警模版
-rw-r--r--  1 prometheus prometheus  10676343 Sep  6 09:55 node_exporter-1.8.2.linux-amd64.tar.gz         #节点监控
-rw-r--r--  1 prometheus prometheus 105689699 Sep  6 09:55 prometheus-2.54.1.linux-amd64.tar.gz             #Prometheus主服务

3、安装软件

3.1、创建安装目录

root@prometheus:~# mkdir /prometheus/ 

3.2、配置启动用户

root@prometheus:~# useradd -M -s /usr/sbin/nologin prometheus

3.3、安装Prometheus

点击下载二进制包

安装Prometheus

root@prometheus:~# wget https://github.com/prometheus/prometheus/releases/download/v2.54.1/prometheus-2.54.1.linux-amd64.tar.gz
root@prometheus:~# tar -xvf prometheus-2.54.1.linux-amd64.tar.gz -C /prometheus/
root@prometheus:~# mv /prometheus/prometheus-2.54.1.linux-amd64   /prometheus/prometheus
root@prometheus:~# chown prometheus:prometheus -R /prometheus/

安装service

root@prometheus:~# cat <<EOF> /etc/systemd/system/prometheus.service 
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network-online.target

[Service]
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
ExecStart=/prometheus/prometheus/prometheus \
  --config.file=/prometheus/prometheus/prometheus.yml \
  --storage.tsdb.path=/prometheus/prometheus/data \
  --storage.tsdb.retention.time=60d \
  --web.enable-lifecycle

[Install]
WantedBy=multi-user.target
EOF

启动Prometheus

root@prometheus:~# systemctl enable --now prometheus.service
root@prometheus:~# systemctl status prometheus.service

浏览器测试,端口:9090

3.4、安装alertmanager

点击下载二进制包

安装alertmanager

root@prometheus:~# wget https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz
root@prometheus:~# tar -xvf alertmanager-0.27.0.linux-amd64.tar.gz -C /prometheus/
root@prometheus:~# mv /prometheus/alertmanager-0.27.0.linux-amd64 /prometheus/alertmanager
root@prometheus:~# chown prometheus:prometheus -R /prometheus/

安装service

root@prometheus:~# cat <<EOF> /etc/systemd/system/alertmanager.service 

[Unit]
Description=Alert Manager
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/prometheus/alertmanager/alertmanager \
  --config.file=/prometheus/alertmanager/alertmanager.yml \
  --storage.path=/prometheus/alertmanager/data

Restart=always

[Install]
WantedBy=multi-user.target
EOF

启动alertmanager

root@prometheus:~# systemctl enable --now alertmanager.service
root@prometheus:~# systemctl status alertmanager.service

浏览器测试,端口:9093

https://picture-1257845304.cos.ap-chengdu.myqcloud.com/202409091753586.png

3.5、安装grafana

官网地址:Download Grafana | Grafana Labs

安装grafana

root@prometheus:~# wget https://dl.grafana.com/enterprise/release/grafana-enterprise-11.2.0.linux-amd64.tar.gz
root@prometheus:~# tar -xvf grafana-enterprise-11.2.0.linux-amd64.tar.gz -C /prometheus/
root@prometheus:~# mv /prometheus/grafana-v11.2.0 /prometheus/grafana
root@prometheus:~# chown prometheus:prometheus -R /prometheus/

安装service

root@prometheus:~# cat <<EOF> /etc/systemd/system/grafana-server.service 
[Unit]
Description=Grafana server
Documentation=http://docs.grafana.org
[Service]
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
ExecStart=/prometheus/grafana/bin/grafana-server \
  --config=/prometheus/grafana/conf/defaults.ini \
  --homepath=/prometheus/grafana
[Install]
WantedBy=multi-user.target
EOF

启动grafana

root@prometheus:~# systemctl enable --now grafana-server.service
root@prometheus:~# systemctl status grafana-server.service

****

浏览器测试,端口:3000

image-20240909180100015

3.6、安装PrometheusAlert

点击下载二进制包

root@prometheus:~# wget https://github.com/feiyu563/PrometheusAlert/releases/download/v4.9.1/linux.zip
root@prometheus:~# unzip linux.zip
root@prometheus:~# mv linux/  PrometheusAlert/
root@prometheus:~# mv PrometheusAlert/ /prometheus/PrometheusAlert/
root@prometheus:~# chown prometheus:prometheus -R /prometheus/

配置启动文件

root@prometheus:~# cat <<EOF> /etc/systemd/system/PrometheusAlert.service                                    
[Unit]
Description=prometheus-alert
Documentation=https://github.com/feiyu563/PrometheusAlert

[Service]
User=prometheus
Group=prometheus
Restart=on-failure
WorkingDirectory=/prometheus/PrometheusAlert
ExecStart=/prometheus/PrometheusAlert/PrometheusAlert

[Install]
WantedBy=multi-user.target
EOF

启动

root@prometheus:~# systemctl enable --now PrometheusAlert.service   
root@prometheus:~# systemctl status PrometheusAlert.service   

浏览器测试,端口:8080,默认用户名密码:prometheusalert

image-20240909181211764

4、配置文件

4.1、添加node节点

点击下载二进制包

安装node_exporter

所有被监控节点都要安装

root@prometheus:~# wget https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz
root@prometheus:~# tar -xvf node_exporter-1.8.2.linux-amd64.tar.gz -C /prometheus/
root@prometheus:~# mv  /prometheus/node_exporter-1.8.2.linux-amd64  /prometheus/node_exporter
root@prometheus:~# chown prometheus:prometheus -R /prometheus/

配置启动文件

root@prometheus:~# cat <<EOF> /etc/systemd/system/node_exporter.service 
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network.target
[Service]
User=prometheus
Group=prometheus
ExecStart=/prometheus/node_exporter/node_exporter
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

启动

root@prometheus:~# systemctl enable --now node_exporter.service
root@prometheus:~# systemctl status node_exporter.service

浏览器测试,端口:9100

4.2、修改Prometheus配置文件

root@prometheus:~# mkdir /prometheus/prometheus/conf.d/ -p    #定义子文件路径
root@prometheus:~# cat /prometheus/prometheus/prometheus.yml
# my global config
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alertmanager configuration   # Alertmanager配置
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - 192.168.11.4:9093   

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alert.yml"     # 配置触发规则
  - "rules/*.yml"   # 添加其他触发规则文件路径,只要在./rules目录下的都可以成为触发规则。可在该目录下定义多个。

# A single scrape_configs block
scrape_configs:
  # Scrape Prometheus itself
  - job_name: "prometheus"  #Prometheus主服务器信息
    static_configs:
      - targets: ["192.168.11.4:9090"]  #Prometheus主服务器ip

  # Scrape Alertmanager
  - job_name: 'alertmanager'  #alertmanager主服务器信息
    scrape_interval: 3s
    static_configs:
      - targets: ['192.168.11.4:9093']  #alertmanager主服务器ip

  # Scrape physical nodes     #添加一个子文件,记录node节点信息,可以添加多个子文件
  - job_name: 'physical_nodes'  # job名字,可以中文
    file_sd_configs:
      - files:
        - '/prometheus/prometheus/conf.d/physical_nodes.yml'        #子文件路径,绝对路径

root@prometheus:/prometheus/prometheus/conf.d# cat physical_nodes.yml 
---
# prometheus/conf.d/physical_nodes.yml 
# 注意格式,添加对应的节点信息即可
- targets: ['192.168.11.4:9100']
  labels:
    instance: '192.168.11.4'
    role: 'Prometheus服务器'

- targets: ['192.168.11.241:9100']
  labels:
    instance: '192.168.11.241'
    role: 'PVE服务器241'

- targets: ['192.168.11.242:9100']
  labels:
    instance: '192.168.11.242'
    role: 'PVE服务器242'

4.3、创建报警规则

# 添加node节点的告警规则
root@prometheus:/prometheus/prometheus# pwd
/prometheus/prometheus

root@prometheus:/prometheus/prometheus# cat alert.yml
groups:
- name: 节点状态
  rules:
  # 对任何实例超过10秒无法联系的情况发出警报
  - alert: 服务告警
    expr: up == 0
    for: 10s
    labels:
      severity: critical
    annotations:
      summary: "服务异常, 实例: {{ $labels.instance }}"
      description: "{{ $labels.job }} 服务已关闭"

#--------------节点资源监控------------------------
- name: 节点资源
  rules:
  - alert: 内存使用检测
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "主机内存不足,实例:{{ $labels.instance }}"
      description: "内存可用率<10%,当前值:{{ $value }}"
  - alert: 内存压力检测
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "内存压力过大,实例:{{ $labels.instance }}"
      description: "节点内存压力大。 重大页面错误率高,当前值为:{{ $value }}"
  - alert: 异常流入网络吞吐量检测
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常流入网络吞吐量,实例:{{ $labels.instance }}"
      description: "网络流入流量 > 100 MB/s,当前值:{{ $value }}"
  - alert: 异常流出网络吞吐量检测
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常流出网络吞吐量,实例:{{ $labels.instance }}"
      description: "网络流出流量 > 100 MB/s,当前值为:{{ $value }}"
  - alert: 磁盘读取异常检测
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常磁盘读取,实例:{{ $labels.instance }}"
      description: "磁盘读取> 50 MB/s,当前值:{{ $value }}"
  - alert: 磁盘写入异常检测
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常磁盘写入,实例:{{ $labels.instance }}"
      description: "磁盘写入> 50 MB/s,当前值:{{ $value }}"
  - alert: 磁盘空间使用检测
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "磁盘空间不足告警,实例:{{ $labels.instance }}"
      description: "剩余磁盘空间< 10% ,当前值:{{ $value }}"
  - alert: 24小时内磁盘空间耗尽检测
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "磁盘空间将在24小时内耗尽,实例:{{ $labels.instance }}"
      description: "以当前写入速率预计磁盘空间将在 24 小时内耗尽,当前值:{{ $value }}"
  - alert: 磁盘Inodes不足检测
    expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "磁盘Inodes不足,实例:{{ $labels.instance }}"
      description: "剩余磁盘 inodes < 10%,当前值: {{ $value }}"
  - alert: 磁盘读取延迟检测
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常磁盘读取延迟,实例:{{ $labels.instance }}"
      description: "磁盘读取延迟 > 100ms,当前值:{{ $value }}"
  - alert: 磁盘写入延迟检测
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常磁盘写入延迟,实例:{{ $labels.instance }}"
      description: "磁盘写入延迟 > 100ms,当前值:{{ $value }}"
  - alert: CPU1分钟负载检测
    expr: node_load1 > 4
    for: 2m
    labels:
      severity: page
    annotations:
      summary: "CPU1分钟负载过高,实例:{{ $labels.instance }}"
      description: "CPU1分钟负载>4,已经持续2分钟。当前值为:{{ $value }}"
  - alert: CPU负载过高检测
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "cpu负载高,实例:{{ $labels.instance }}"
      description: "cpu负载> 80%,当前值:{{ $value }}"
  - alert: CPU窃取异常检测
    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "CPU窃取率异常,实例:{{ $labels.instance }}"
      description: "CPU 窃取率 > 10%。 嘈杂的邻居正在扼杀 VM 性能,或者 Spot 实例可能失去信用,当前值:{{ $value }}"
  - alert: 磁盘swap空间使用异常检查
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "磁盘swap空间使用率异常,实例:{{ $labels.instance }}"
      description: "磁盘swap空间使用率>80%"
  - alert: 异常网络接收错误检测
    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常网络接收错误,实例:{{ $labels.instance }}"
      description: "网卡{{ $labels.device }}在过去2分钟接收{{ $value }}个错误"
  - alert: 异常网络传输错误检查
    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常网络传输错误,实例:{{ $labels.instance }}"
      description: "网卡{{ $labels.device }}在过去2分钟传输{{ $value }}个错误"
  - alert: 异常网络接口饱和检测
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "异常网络接口饱和,实例:{{ $labels.instance }}"
      description: "网卡{{ $labels.device }}正在超载,当前值{{ $value }}"
  - alert: 异常连接数检测
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常连接数,实例:{{ $labels.instance }}"
      description: "连接数过大,当前连接数:{{ $value }}"
  - alert: 异常时钟偏差检测
    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常时钟偏差,实例:{{ $labels.instance }}"
      description: "检测到时钟偏差,时钟不同步。值为:{{ $value }}"
  - alert: 内核耗尽检测
    expr: node_filefd_allocated / node_filefd_maximum * 100 > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "预计内核将很快耗尽文件描述符限制"
      description: "{{ $labels.instance }}已分配的文件描述符数超过了限制的80%,当前值为:{{ $value }}"
      
      
# 可以添加多个规则,比如,这里针对docker的创建的对应的规则
root@prometheus:/prometheus/prometheus/rules# pwd
/prometheus/prometheus/rules

root@prometheus:/prometheus/prometheus/rules# cat docker.yml 
groups:
- name: Docker监控
  rules:
  - alert: 容器被杀
    expr: time() - container_last_seen > 300
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Docker容器被杀死 容器:{{ $labels.instance }}"
      description: "{{ $value }}个容器消失了超过5分钟"
  # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
  - alert: 容器不存在
    expr: absent(container_last_seen)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "无容器 容器: {{ $labels.instance }}"
      description: "5分钟检查容器不存在,值为:{{ $value }}"
  - alert: 容器cpu使用率告警
    expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 300
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "容器cpu使用率告警 容器: {{ $labels.instance }}"
      description: "容器cpu使用率超过300%,当前值为:{{ $value }}"
  - alert: 容器内存使用率告警
    expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "容器内存使用率告警 容器: {{ $labels.instance }}"
      description: "容器内存使用率超过80%,当前值为:{{ $value }}"
  - alert: 容器存储io使用率告警
    expr: (sum(container_fs_io_current{name!=""}) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "容器存储io使用率告警 容器: {{ $labels.instance }}"
      description: "容器存储io使用率超过 80%,当前值为:{{ $value }}"
  - alert: 容器限制告警
    expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "容器限制告警 容器:{{ $labels.instance }}"
      description: "容器被限制,当前值为:{{ $value }}"

4.4、配置 alertmanager配置文件

root@prometheus:/prometheus/alertmanager# pwd
/prometheus/alertmanager

root@prometheus:/prometheus/alertmanager# cat alertmanager.yml
global:
  resolve_timeout: 10s

route:
  receiver: "prometheusalert-wx"  # 根路由默认接收者为微信告警
  group_by: ['instance']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5s

  # 定义多个接收者的子路由
  routes:
    - receiver: 'prometheusalert-wx'  # 发送到微信机器人
      continue: true  # 继续处理下一个路由

receivers:
  # 微信机器人告警接收者
  - name: 'prometheusalert-wx'
    webhook_configs:
    - url: 'http://192.168.11.4:8080/prometheusalert?type=wx&tpl=prometheus-wx'       #这个地址是PrometheusAlert的地址后面模板用的是prometheus-wx
      send_resolved: true  # 告警结束后通知已恢复

4.5、配置PrometheusAlert

修改对应的企业微信部分,添加企业微信机器人的webhook地址,其他的不用管,需要用到什么就修改什么

root@prometheus:/prometheus/PrometheusAlert/conf# pwd
/prometheus/PrometheusAlert/conf


root@prometheus:/prometheus/PrometheusAlert/conf# cat app.conf 
#---------------------↓全局配置-----------------------
appname = PrometheusAlert
#登录用户名
login_user=root
#登录密码
login_password=0000
#监听地址
httpaddr = "0.0.0.0"
#监听端口
httpport = 8080
runmode = dev
#设置代理 proxy = http://123.123.123.123:8080
proxy =
#开启JSON请求
copyrequestbody = true
#告警消息标题
title=PrometheusAlert
#链接到告警平台地址
GraylogAlerturl=http://graylog.org
#钉钉告警 告警logo图标地址
logourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
#钉钉告警 恢复logo图标地址
rlogourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
#短信告警级别(等于3就进行短信告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
messagelevel=3
#电话告警级别(等于4就进行语音告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
phonecalllevel=4
#默认拨打号码(页面测试短信和电话功能需要配置此项)
defaultphone=xxxxxxxx
#故障恢复是否启用电话通知0为关闭,1为开启
phonecallresolved=0
#是否前台输出file or console
logtype=file
#日志文件路径
logpath=logs/prometheusalertcenter.log
#转换Prometheus,graylog告警消息的时区为CST时区(如默认已经是CST时区,请勿开启)
prometheus_cst_time=0
#数据库驱动,支持sqlite3,mysql,postgres如使用mysql或postgres,请开启db_host,db_port,db_user,db_password,db_name的注释
db_driver=sqlite3
#db_host=127.0.0.1
#db_port=3306
#db_user=root
#db_password=root
#db_name=prometheusalert
#是否开启告警记录 0为关闭,1为开启
AlertRecord=1
#是否开启告警记录定时删除 0为关闭,1为开启
RecordLive=0
#告警记录定时删除周期,单位天
RecordLiveDay=7
# 是否将告警记录写入es7,0为关闭,1为开启
alert_to_es=0
# es地址,是[]string
# beego.Appconfig.Strings读取配置为[]string,使用";"而不是","
to_es_url=http://localhost:9200
# to_es_url=http://es1:9200;http://es2:9200;http://es3:9200
# es用户和密码
# to_es_user=username
# to_es_pwd=password
# 长连接最大空闲数
maxIdleConns=100
# 热更新配置文件
open-hotreload=0

#---------------------↓webhook-----------------------
#是否开启钉钉告警通道,可同时开始多个通道0为关闭,1为开启
open-dingding=1
#默认钉钉机器人地址
ddurl=https://oapi.dingtalk.com/robot/send?access_token=xxxxx
#是否开启 @所有人(0为关闭,1为开启)
dd_isatall=1
#是否开启钉钉机器人加签,0为关闭,1为开启
# 使用方法:https://oapi.dingtalk.com/robot/send?access_token=XXXXXX&secret=mysecret
open-dingding-secret=0

#是否开启微信告警通道,可同时开始多个通道0为关闭,1为开启
open-weixin=1
#默认企业微信机器人地址
open-workwechat=1
wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxx
wh_contenttype=application/json

#是否开启飞书告警通道,可同时开始多个通道0为关闭,1为开启
open-feishu=1
#默认飞书机器人地址
fsurl=https://open.feishu.cn/open-apis/bot/hook/xxxxxxxxx
# webhook 发送 http 请求的 contentType, 如 application/json, application/x-www-form-urlencoded,不配置默认 application/json
wh_contenttype=application/json

#---------------------↓腾讯云接口-----------------------
#是否开启腾讯云短信告警通道,可同时开始多个通道0为关闭,1为开启
open-txdx=0
#腾讯云短信接口key
TXY_DX_appkey=xxxxx
#腾讯云短信模版ID 腾讯云短信模版配置可参考 prometheus告警:{1}
TXY_DX_tpl_id=xxxxx
#腾讯云短信sdk app id
TXY_DX_sdkappid=xxxxx
#腾讯云短信签名 根据自己审核通过的签名来填写
TXY_DX_sign=腾讯云

#是否开启腾讯云电话告警通道,可同时开始多个通道0为关闭,1为开启
open-txdh=0
#腾讯云电话接口key
TXY_DH_phonecallappkey=xxxxx
#腾讯云电话模版ID
TXY_DH_phonecalltpl_id=xxxxx
#腾讯云电话sdk app id
TXY_DH_phonecallsdkappid=xxxxx

#---------------------↓华为云接口-----------------------
#是否开启华为云短信告警通道,可同时开始多个通道0为关闭,1为开启
open-hwdx=0
#华为云短信接口key
HWY_DX_APP_Key=xxxxxxxxxxxxxxxxxxxxxx
#华为云短信接口Secret
HWY_DX_APP_Secret=xxxxxxxxxxxxxxxxxxxxxx
#华为云APP接入地址(端口接口地址)
HWY_DX_APP_Url=https://rtcsms.cn-north-1.myhuaweicloud.com:10743
#华为云短信模板ID
HWY_DX_Templateid=xxxxxxxxxxxxxxxxxxxxxx
#华为云签名名称,必须是已审核通过的,与模板类型一致的签名名称,按照自己的实际签名填写
HWY_DX_Signature=华为云
#华为云签名通道号
HWY_DX_Sender=xxxxxxxxxx

#---------------------↓阿里云接口-----------------------
#是否开启阿里云短信告警通道,可同时开始多个通道0为关闭,1为开启
open-alydx=0
#阿里云短信主账号AccessKey的ID
ALY_DX_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
#阿里云短信接口密钥
ALY_DX_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
#阿里云短信签名名称
ALY_DX_SignName=阿里云
#阿里云短信模板ID
ALY_DX_Template=xxxxxxxxxxxxxxxxxxxxxx

#是否开启阿里云电话告警通道,可同时开始多个通道0为关闭,1为开启
open-alydh=0
#阿里云电话主账号AccessKey的ID
ALY_DH_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
#阿里云电话接口密钥
ALY_DH_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
#阿里云电话被叫显号,必须是已购买的号码
ALY_DX_CalledShowNumber=xxxxxxxxx
#阿里云电话文本转语音(TTS)模板ID
ALY_DH_TtsCode=xxxxxxxx

#---------------------↓容联云接口-----------------------
#是否开启容联云电话告警通道,可同时开始多个通道0为关闭,1为开启
open-rlydh=0
#容联云基础接口地址
RLY_URL=https://app.cloopen.com:8883/2013-12-26/Accounts/
#容联云后台SID
RLY_ACCOUNT_SID=xxxxxxxxxxx
#容联云api-token
RLY_ACCOUNT_TOKEN=xxxxxxxxxx
#容联云app_id
RLY_APP_ID=xxxxxxxxxxxxx

#---------------------↓邮件配置-----------------------
#是否开启邮件
open-email=1
#邮件发件服务器地址
Email_host=smtp.qq.com
#邮件发件服务器端口
Email_port=465
#邮件帐号
Email_user=1026013275@qq.com
#邮件密码
Email_password=xxxxx
#邮件标题
Email_title=运维告警
#默认发送邮箱
Default_emails=zxw@xxjsjl.cn,zhangxuewen@ghgame.cn

#---------------------↓七陌云接口-----------------------
#是否开启七陌短信告警通道,可同时开始多个通道0为关闭,1为开启
open-7moordx=0
#七陌账户ID
7MOOR_ACCOUNT_ID=Nxxx
#七陌账户APISecret
7MOOR_ACCOUNT_APISECRET=xxx
#七陌账户短信模板编号
7MOOR_DX_TEMPLATENUM=n
#注意:七陌短信变量这里只用一个var1,在代码里写死了。
#-----------
#是否开启七陌webcall语音通知告警通道,可同时开始多个通道0为关闭,1为开启
open-7moordh=0
#请在七陌平台添加虚拟服务号、文本节点
#七陌账户webcall的虚拟服务号
7MOOR_WEBCALL_SERVICENO=xxx
# 文本节点里被替换的变量,我配置的是text。如果被替换的变量不是text,请修改此配置
7MOOR_WEBCALL_VOICE_VAR=text

#---------------------↓telegram接口-----------------------
#是否开启telegram告警通道,可同时开始多个通道0为关闭,1为开启
open-tg=0
#tg机器人token
TG_TOKEN=xxxxx
#tg消息模式 个人消息或者频道消息 0为关闭(推送给个人),1为开启(推送给频道)
TG_MODE_CHAN=0
#tg用户ID
TG_USERID=xxxxx
#tg频道name或者id, 频道name需要以@开始
TG_CHANNAME=xxxxx
#tg api地址, 可以配置为代理地址
#TG_API_PROXY="https://api.telegram.org/bot%s/%s"

#---------------------↓workwechat接口-----------------------
#是否开启workwechat告警通道,可同时开始多个通道0为关闭,1为开启
open-workwechat=0
# 企业ID
WorkWechat_CropID=xxxxx
# 应用ID
WorkWechat_AgentID=xxxx
# 应用secret
WorkWechat_AgentSecret=xxxx
# 接受用户
WorkWechat_ToUser="zhangsan|lisi"
# 接受部门
WorkWechat_ToParty="ops|dev"
# 接受标签
WorkWechat_ToTag=""
# 消息类型, 暂时只支持markdown
# WorkWechat_Msgtype = "markdown"

#---------------------↓百度云接口-----------------------
#是否开启百度云短信告警通道,可同时开始多个通道0为关闭,1为开启
open-baidudx=0
#百度云短信接口AK(ACCESS_KEY_ID)
BDY_DX_AK=xxxxx
#百度云短信接口SK(SECRET_ACCESS_KEY)
BDY_DX_SK=xxxxx
#百度云短信ENDPOINT(ENDPOINT参数需要用指定区域的域名来进行定义,如服务所在区域为北京,则为)
BDY_DX_ENDPOINT=http://smsv3.bj.baidubce.com
#百度云短信模版ID,根据自己审核通过的模版来填写(模版支持一个参数code:如prometheus告警:{code})
BDY_DX_TEMPLATE_ID=xxxxx
#百度云短信签名ID,根据自己审核通过的签名来填写
TXY_DX_SIGNATURE_ID=xxxxx

#---------------------↓百度Hi(如流)-----------------------
#是否开启百度Hi(如流)告警通道,可同时开始多个通道0为关闭,1为开启
open-ruliu=0
#默认百度Hi(如流)机器人地址
BDRL_URL=https://api.im.baidu.com/api/msg/groupmsgsend?access_token=xxxxxxxxxxxxxx
#百度Hi(如流)群ID
BDRL_ID=123456
#---------------------↓bark接口-----------------------
#是否开启bark告警通道,可同时开始多个通道0为关闭,1为开启
open-bark=0
#bark默认地址, 建议自行部署bark-server
BARK_URL=https://api.day.app
#bark key, 多个key使用分割
BARK_KEYS=xxxxx
# 复制, 推荐开启
BARK_COPY=1
# 历史记录保存,推荐开启
BARK_ARCHIVE=1
# 消息分组
BARK_GROUP=PrometheusAlert

#---------------------↓语音播报-----------------------
#语音播报需要配合语音播报插件才能使用
#是否开启语音播报通道,0为关闭,1为开启
open-voice=1
VOICE_IP=127.0.0.1
VOICE_PORT=9999

#---------------------↓飞书机器人应用-----------------------
#是否开启feishuapp告警通道,可同时开始多个通道0为关闭,1为开启
open-feishuapp=1
# APPID
FEISHU_APPID=cli_xxxxxxxxxxxxx
# APPSECRET
FEISHU_APPSECRET=xxxxxxxxxxxxxxxxxxxxxx
# 可填飞书 用户open_id、user_id、union_ids、部门open_department_id
AT_USER_ID="xxxxxxxx"


#---------------------↓告警组-----------------------
# 有其他新增的配置段,请放在告警组的上面
# 暂时仅针对 PrometheusContronller 中的 /prometheus/alert 路由
# 告警组如果放在了 wx, dd... 那部分的上分,beego section 取 url 值不太对。
# 所以这里使用 include 来包含另告警组配置

# 是否启用告警组功能
open-alertgroup=0

# 自定义的告警组既可以写在这里,也可以写在单独的文件里。
# 写在单独的告警组配置里更便于修改。
# include "alertgroup.conf"

#---------------------↓kafka地址-----------------------
# kafka服务器的地址
open-kafka=1
kafka_server = 127.0.0.1:9092
# 写入消息的kafka topic
kafka_topic = devops
# 用于标记该消息是来自PrometheusAlert,一般无需修改

4.6、配置PrometheusAlert告警模板

image-20240909183708603

模板内容:

{{ $var := .externalURL}}{{ range $k,$v:=.alerts -}}
{{ if eq $v.status "resolved" -}}
## [武汉监控恢复信息]({{$v.generatorURL}})💨

##### 🌟<font color="#02b340">【恢复名称】✅</font>[{{$v.labels.alertname}}]({{$var}})✅{{ if $v.labels.severity }}
##### 🌟<font color="#02b340">【恢复级别】</font>
{{- if eq $v.labels.severity "info" }}info 
{{- else if eq $v.labels.severity "warning" }}warning
{{- else if eq $v.labels.severity "critical" }}critical
{{- else if eq $v.labels.severity "error" }}error 
{{ else }}{{ $v.labels.severity }}
{{ end -}}
{{ end }}
##### 🌟<font color="#02b340">【当前状态】</font><font color="#67C23A">已恢复</font>
##### 🌟<font color="#02b340">【开始时间】</font>{{GetCSTtime $v.startsAt}}
##### 🌟<font color="#02b340">【结束时间】</font>{{GetCSTtime $v.endsAt}}
##### 🌟<font color="#02b340">【主机地址】</font>{{$v.labels.instance}}
##### 🌟<font color="#02b340">【主机备注】</font>{{$v.labels.role}}


**<font color="#02b340">{{$v.annotations.description}}</font>**
{{ else -}}
## [武汉监控告警信息]({{$v.generatorURL}})💨

##### 🌟<font color="#FF0000">【告警名称】🔔</font>[{{$v.labels.alertname}}]({{$var}})🔔{{ if $v.labels.severity }}
##### 🌟<font color="#FF0000">【告警级别】</font>
{{- if eq $v.labels.severity "info"}}info
{{- else if eq $v.labels.severity "warning"}}warning 🔥
{{- else if eq $v.labels.severity "critical"}}critical 💔💔
{{- else if eq $v.labels.severity "error"}}error 🔥🔥❌
{{ else }}{{ $v.labels.severity }}
{{ end -}}
{{ end }}
##### 🌟<font color="#FF0000">【当前状态】</font><font color="#E6A23C">需要处理</font>
##### 🌟<font color="#FF0000">【开始时间】</font>{{GetCSTtime $v.startsAt}}
##### 🌟<font color="#FF0000">【主机地址】</font>{{$v.labels.instance}}
##### 🌟<font color="#FF0000">【主机备注】</font>{{$v.labels.role}}

**<font color="#E6A23C">{{$v.annotations.description}}</font>**💨💨

## ✨[点我去屏蔽告警]({{$var}})✨
{{ end -}}
{{ end }}

4.7、配置Grafana展示

配置中文显示

添加数据源

image-20240909184022672

下滑最后保存。

添加看板,可以去Grafana查找心仪的面板

官方地址:Grafana dashboards

以node为例

复制ID

image-20240909184440810

返回Grafana,导入

4.8、企业微信告警反馈

2
广告 广告

评论区