Official documentation: https://prometheus.io/docs/introduction/overview/
~]# useradd -r -m -d /var/lib/prometheus prometheus
wget https://github.com/prometheus/prometheus/releases/download/v2.14.0/prometheus-2.14.0.linux-amd64.tar.gz
tar -xf prometheus-2.14.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local
ln -sv prometheus-2.14.0.linux-amd64 prometheus
vim /usr/lib/systemd/system/prometheus.service

[Unit]
Description=The Prometheus 2 monitoring system and time series database.
Documentation=https://prometheus.io
After=network.target

[Service]
EnvironmentFile=-/etc/sysconfig/prometheus
User=prometheus
ExecStart=/usr/local/prometheus/prometheus \
  --storage.tsdb.path=/home/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --web.listen-address=0.0.0.0:9090 \
  --web.external-url= $PROM_EXTRA_ARGS
Restart=on-failure
StartLimitInterval=1
RestartSec=3

[Install]
WantedBy=multi-user.target
Other runtime options: ./prometheus --help
systemctl daemon-reload
systemctl start prometheus.service
iptables -I INPUT -p tcp --dport 9090 -s NETWORK/MASK -j ACCEPT
http://IP:PORT
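Once the web UI answers, the HTTP API gives a quick sanity check that the server is up and scraping; a minimal sketch, assuming the default listen address configured above:

curl -s http://localhost:9090/-/ready                   # readiness endpoint
curl -s 'http://localhost:9090/api/v1/query?query=up'   # instant query: up == 1 means the target was scraped successfully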
$ docker run --name prometheus -d \
    -v ./prometheus:/etc/prometheus/ \
    -v ./db/:/prometheus \
    -p 9090:9090 \
    prom/prometheus \
    --config.file=/etc/prometheus/prometheus.yml \
    --web.listen-address="0.0.0.0:9090" \
    --storage.tsdb.path=/prometheus \
    --web.console.libraries=/usr/share/prometheus/console_libraries \
    --web.console.templates=/usr/share/prometheus/consoles \
    --storage.tsdb.retention=30d
--config.file=/etc/prometheus/prometheus.yml                       # main configuration file
--web.listen-address="0.0.0.0:9090"                                # listen address and port
--storage.tsdb.path=/prometheus                                    # TSDB data directory
--web.console.libraries=/usr/share/prometheus/console_libraries
--web.console.templates=/usr/share/prometheus/consoles             # console libraries and templates
--storage.tsdb.retention=60d                                       # data retention period, default is 15d
The main Prometheus configuration file is prometheus.yml.
It consists mainly of the global, rule_files, scrape_configs, alerting, remote_write, and remote_read sections:
- global: global configuration section.
- rule_files: paths to the alerting/recording rule files.
- scrape_configs: the set of scrape configurations, defining the collections of monitoring targets and how their metrics are scraped. Typically each scrape configuration corresponds to one job, and its targets can either be listed statically (static_configs) or discovered automatically through one of the service-discovery mechanisms Prometheus supports.
- job_name: 'nodes'
  static_configs:          # static targets; each host:port/metrics becomes a scrape target
    - targets: ['localhost:9100']
    - targets: ['172.20.94.1:9100']
- job_name: 'docker_host'
  file_sd_configs:         # file-based service discovery; host:port entries in the files (YAML or JSON) become scrape targets
    - files:
        - ./sd_files/docker_host.yml
      refresh_interval: 30s
- alerting: the set of Alertmanager instances Prometheus can send alerts to, and the parameters describing how to interact with them.
Each Alertmanager can be defined statically (static_configs) or discovered automatically through one of Prometheus's supported service-discovery mechanisms.
- remote_write: configures remote writes. Define this section when Prometheus needs to store data in an external system (e.g. InfluxDB); Prometheus then sends sample data over HTTP to the adaptor identified by the URL.
- remote_read: configures remote reads. Prometheus hands incoming query requests to the adapter identified by the URL; the adapter translates them into the remote storage's query format and converts the responses back into a form Prometheus can use.
Monitoring and alerting rule files: *.yml
rule_files:
  - "test_rules.yml"   # path to the alerting rule file
Service-discovery target files: both YAML and JSON formats are supported.
file_sd_configs:
  - files:
      - ./sd_files/http.yml
    refresh_interval: 30s
global:
  scrape_interval: 15s       # scrape metrics every 15 seconds
  evaluation_interval: 15s   # evaluate alerting rules every 15 seconds
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["localhost:9093"]   # where alerts are pushed, normally the Alertmanager address
rule_files:
  - "test_rules.yml"         # path to the alerting rule file
scrape_configs:
  - job_name: 'node'         # user-defined job name
    static_configs:          # static targets, listed directly as ip:port
      - targets: ['localhost:9100']
  - job_name: 'CDG-MS'
    honor_labels: true
    metrics_path: '/prometheus'
    static_configs:
      - targets: ['localhost:8089']
    relabel_configs:
      - target_label: env
        replacement: dev
  - job_name: 'eureka'
    file_sd_configs:         # file-based service discovery
      - files:
          - "/app/enmonster/basic/prometheus/prometheus-2.2.1.linux-amd64/eureka.json"   # JSON and YAML are both supported
        refresh_interval: 30s   # re-read the file every 30s; edits take effect without a manual reload
    relabel_configs:
      - source_labels: [__job_name__]
        regex: (.*)
        target_label: job
        replacement: ${1}
      - target_label: env
        replacement: dev
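Before starting or reloading with a configuration like the one above, it can be validated with promtool, which ships in the same tarball as the prometheus binary (paths here follow the install layout used earlier):

/usr/local/prometheus/promtool check config /usr/local/prometheus/prometheus.yml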
Example alerting rule file:
[root@host40 monitor-bak]# cat prometheus/rules/docker_monitor.yml
groups:
- name: "container monitor"
  rules:
  - alert: "Container down: env1"
    expr: time() - container_last_seen{name="env1"} > 60
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Container down: {{$labels.instance}} name={{$labels.name}}"
File-based service-discovery target files: *.yml
[root@host40 monitor]# cat prometheus/sd_files/virtual_lan.yml
- targets: ['10.10.11.179:9100']
- targets: ['10.10.11.178:9100']
[root@host40 monitor]# cat prometheus/sd_files/tcp.yml
- targets: ['10.10.11.178:8001']
  labels:
    server_name: http_download
- targets: ['10.10.11.178:3307']
  labels:
    server_name: xiaojing_db
- targets: ['10.10.11.178:3001']
  labels:
    server_name: test_web
node_exporter is installed on each monitored node; it collects host metrics and exposes them over HTTP for Prometheus to scrape.
Download and unpack:
wget https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz
tar xf node_exporter-0.18.1.linux-amd64.tar.gz -C /usr/local/
cd /usr/local
ln -sv node_exporter-0.18.1.linux-amd64/ node_exporter
Create the user:
useradd -r -m -d /var/lib/prometheus prometheus
Configure the unit file:
vim /usr/lib/systemd/system/node_exporter.service

[Unit]
Description=Prometheus exporter for machine metrics, written in Go with pluggable metric collectors.
Documentation=https://github.com/prometheus/node_exporter
After=network.target

[Service]
EnvironmentFile=-/etc/sysconfig/node_exporter
User=prometheus
ExecStart=/usr/local/node_exporter/node_exporter \
  $NODE_EXPORTER_OPTS
Restart=on-failure
StartLimitInterval=1
RestartSec=3

[Install]
WantedBy=multi-user.target
Start the service:
systemctl daemon-reload
systemctl start node_exporter.service
You can verify manually that metrics are available:
curl http://localhost:9100/metrics
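For example, to confirm that a particular collector is reporting, filter the output; a quick check, assuming the default port 9100:

curl -s http://localhost:9100/metrics | grep '^node_cpu_seconds_total' | head -n 3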
Open the firewall:
iptables -I INPUT -p tcp --dport 9100 -s NET/MASK -j ACCEPT
Image: quay.io/prometheus/node-exporter or prom/node-exporter
Start command:
docker run -d --net="host" --pid="host" \
  -v "/:/host:ro,rslave" \
  --name monitor-node-exporter --restart always \
  quay.io/prometheus/node-exporter \
  --path.rootfs=/host --web.listen-address=:9100
On some older Docker versions you may see: Error response from daemon: linux mounts: Could not find source mount of /
Workaround: change -v "/:/host:ro,rslave" to -v "/:/host:ro"
Enabling and disabling collectors:
./node_exporter --help   # lists all supported collectors; enable or disable individual collectors as needed
For example, --no-collector.cpu stops collecting CPU-related metrics.
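A hedged sketch of combining such flags (the collector names are examples; check ./node_exporter --help for the exact set in your version):

# disable the default-on NFS collector, enable the off-by-default systemd collector
./node_exporter --no-collector.nfs --collector.systemd
# list every collector-related flag and its default
./node_exporter --help 2>&1 | grep collector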
Textfile Collector
The textfile collector is enabled with the startup flag --collector.textfile.directory="DIR". It collects metrics from every *.prom file in that directory; the metrics must be in the Prometheus exposition format.
Example:
echo my_batch_job_completion_time $(date +%s) > /path/to/directory/my_batch_job.prom.$$
mv /path/to/directory/my_batch_job.prom.$$ /path/to/directory/my_batch_job.prom

echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom

rpc_duration_seconds{quantile="0.5"} 4773
http_request_duration_seconds_bucket{le="0.5"} 129389
In other words, when node_exporter's built-in collectors are not enough, a script can gather custom metrics and write them to a file, and node_exporter then exposes them for Prometheus to scrape.
This often removes the need for a Pushgateway; a sketch of such a script follows.
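A minimal sketch of a cron-driven collector script, assuming node_exporter was started with --collector.textfile.directory=/var/lib/node_exporter/textfiles (that directory name is an assumption, not from the original setup):

#!/bin/bash
# Export the number of logged-in users through the textfile collector.
TEXTFILE_DIR=/var/lib/node_exporter/textfiles
TMP=$(mktemp "${TEXTFILE_DIR}/logged_in_users.prom.XXXXXX")
echo "node_logged_in_users $(who | wc -l)" > "$TMP"
# Rename atomically so node_exporter never reads a half-written file.
mv "$TMP" "${TEXTFILE_DIR}/logged_in_users.prom"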
Example prometheus.yml:
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'nodes'
    static_configs:
      - targets: ['localhost:9100']
  - job_name: 'node_real_lan'
    file_sd_configs:
      - files:
          - ./sd_files/real_lan.yml
sudo docker run \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:ro \
  --volume=/sys:/sys:ro \
  --volume=/var/lib/docker/:/var/lib/docker:ro \
  --volume=/dev/disk/:/dev/disk:ro \
  --publish=9080:8080 \
  --detach=true \
  --name=cadvisor \
  --privileged \
  --device=/dev/kmsg \
  google/cadvisor:v0.33.0
Configuration example:
- job_name: 'docker'
  static_configs:
    - targets: ['localhost:9080']   # cadvisor, published on port 9080 in the docker run above; adjust to your hosts
Download and install:
wget https://dl.grafana.com/oss/release/grafana-7.2.2-1.x86_64.rpm
sudo yum install grafana-7.2.2-1.x86_64.rpm
Prepare the service file:
[Unit]
Description=Grafana instance
Documentation=http://docs.grafana.org
Wants=network-online.target
After=network-online.target
After=postgresql.service mariadb.service mysqld.service

[Service]
EnvironmentFile=/etc/sysconfig/grafana-server
User=grafana
Group=grafana
Type=notify
Restart=on-failure
WorkingDirectory=/usr/share/grafana
RuntimeDirectory=grafana
RuntimeDirectoryMode=0750
ExecStart=/usr/sbin/grafana-server \
  --config=${CONF_FILE} \
  --pidfile=${PID_FILE_DIR}/grafana-server.pid \
  --packaging=rpm \
  cfg:default.paths.logs=${LOG_DIR} \
  cfg:default.paths.data=${DATA_DIR} \
  cfg:default.paths.plugins=${PLUGINS_DIR} \
  cfg:default.paths.provisioning=${PROVISIONING_CFG_DIR}
LimitNOFILE=10000
TimeoutStopSec=20

[Install]
WantedBy=multi-user.target
Start Grafana:
systemctl enable grafana-server.service
systemctl restart grafana-server.service
Grafana listens on port 3000 by default.
Open the firewall:
iptables -I INPUT -p tcp --dport 3000 -s NET/MASK -j ACCEPT
image: grafana/grafana
docker run -d --name=grafana -p 3000:3000 grafana/grafana:7.2.2
Access the web UI:
http://ip:port
On first login you will be asked to set a username and password.
Version 7.2 asks you to log in and then reset the password; the initial username and password are both admin.
Usage workflow:
Commonly used dashboard template IDs:
Resetting the admin password:
Check the Grafana configuration file to determine the path of grafana.db. Config file: /etc/grafana/grafana.ini

[paths]
;data = /var/lib/grafana
[database]
# For "sqlite3" only, path relative to data_path setting
;path = grafana.db

From the configuration, the full path of grafana.db is /var/lib/grafana/grafana.db.
Change the admin password with sqlite3:

sqlite3 /var/lib/grafana/grafana.db
sqlite> update user set password = '59acf18b94d7eb0694c61e60ce44c110c7a683ac6a8f09580d626f90f4a242000746579358d77dd9e570e83fa24faa88a8a6', salt = 'F3FAxVm33R' where login = 'admin';
sqlite> .exit
Then log in with admin / admin.
Configure the SMTP server and sender mailbox for grafana-server:
vim /etc/grafana/grafana.ini

[smtp]
enabled = true
host = smtp.126.com:465
user = USER@126.com
password = PASS
skip_verify = false
from_address = USER@126.com
from_name = Grafana Alert
Add a Notification Channel in the Grafana UI:
Alerting -> Notification Channels; you can use Send Test before saving.
Open a dashboard and add alert rules.
PromQL is the language Prometheus uses to query its database; it turns the metrics collected by the exporters into graphs and visualisations, and into alerting rules.
gauges: a single numerical value that can go up or down, e.g.:
node_boot_time_seconds{instance="10.10.11.40:9100",job="node_real_lan"} 1574040030
counters: cumulative counts that only increase.
histograms: describe the distribution of sampled data, e.g. maximum, minimum, mean, median, and percentiles.
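For example, quantiles can be computed from a histogram's _bucket series with histogram_quantile(); a sketch using Prometheus's own request-duration histogram and the HTTP API (the server address localhost:9090 is an assumption):

# 95th-percentile HTTP request duration over the last 5 minutes
curl -G 'http://localhost:9090/api/v1/query' --data-urlencode \
  'query=histogram_quantile(0.95, sum(rate(prometheus_http_request_duration_seconds_bucket[5m])) by (le))'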
node_boot_time_seconds{instance="10.10.11.40:9100",job="node_real_lan"}
In the sample above, instance and job are labels.
Labels can also be defined in the configuration file, e.g.:
- targets: ['10.10.11.178:3001']
  labels:
    server_name: test_web
The added label can then be used when querying Prometheus:
metric{server_name="...", ...}
Calculating CPU usage:
(1-((sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance))/(sum(increase(node_cpu_seconds_total[1m])) by (instance)))) * 100
The metrics involved:
node_cpu_seconds_total                # total CPU time, split by mode
node_cpu_seconds_total{mode="idle"}   # idle CPU time; other mode label values: user, system, steal, softirq, irq, nice, iowait
Functions used:
increase(<metric>[1m])   # increase over the last 1 minute
sum()                    # sum over series
sum() by (TAG)           # TAG is a label; here instance identifies the host, so the sum is taken per host; otherwise multiple hosts collapse into a single line
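The same CPU-usage figure is often written with rate() instead of increase(); an equivalent sketch, runnable through the HTTP API (server address assumed):

# CPU usage percentage per instance over the last minute
curl -G 'http://localhost:9090/api/v1/query' --data-urlencode \
  'query=(1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance)) * 100'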
Label matching operators:
=   # equal: select labels that are exactly equal to the provided string.
!=  # not equal: select labels that are not equal to the provided string.
=~  # regex match: select labels that regex-match the provided string.
!~  # regex mismatch: select labels that do not regex-match the provided string.
Examples:
node_cpu_seconds_total{mode="idle"}   # mode is a label that comes with the metric
api_http_requests_total{method="POST", handler="/messages"}
http_requests_total{environment=~"staging|testing|development",method!="GET"}
Note: a selector must specify a metric name or at least one label matcher that does not match the empty string.
{job=~".*"}               # Bad!
{job=~".+"}               # Good!
{job=~".*",method="get"}  # Good!
Time ranges:
s - seconds
m - minutes
h - hours
d - days
w - weeks
y - years
Operators:
+ (addition)   - (subtraction)   * (multiplication)   / (division)   % (modulo)   ^ (power/exponentiation)
== (equal)   != (not-equal)   > (greater-than)   < (less-than)   >= (greater-or-equal)   <= (less-or-equal)
and (intersection)   or (union)   unless (complement)
Aggregation operators:
sum (calculate sum over dimensions)
min (select minimum over dimensions)
max (select maximum over dimensions)
avg (calculate the average over dimensions)
stddev (calculate population standard deviation over dimensions)
stdvar (calculate population standard variance over dimensions)
count (count number of elements in the vector)
count_values (count number of elements with the same value)
bottomk (smallest k elements by sample value)
topk (largest k elements by sample value)
quantile (calculate φ-quantile (0 ≤ φ ≤ 1) over dimensions)
sum() by (instance)   # sum, grouped by the given label(s)
increase()   # increase over a time range, for counter-type metrics
Example:
increase(node_network_receive_bytes_total[30s])   # bytes received over the last 30s
rate()   # designed for counters: the per-second average rate of increase over the given time window
Example:
rate(node_network_receive_bytes_total[30s])*8   # inbound bandwidth in bits per second
topk()   # given a number x, return the x series with the highest values
Examples:
topk(5, node_cpu_seconds_total)                             # the 5 series with the largest CPU time
topk(5, increase(node_network_receive_bytes_total[10m]))    # the top 5 receivers of traffic in the last 10 minutes
Note:
In graph mode topk produces jumpy, scattered series because the top-x membership changes over time; it is better evaluated in the console, where the value is taken once.
count()   # count, e.g. count(node_load1 > 5)
avg() by (label)   # average, grouped by label
Recording rules aggregate scraped data into new time series.
Example:
Record the following expression in prometheus.rules.yml:
avg(rate(rpc_durations_seconds_count[5m])) by (job, service)
In prometheus.rules.yml this becomes:
groups:
  - name: example
    rules:
      - record: job_service:rpc_durations_seconds_count:avg_rate5m
        expr: avg(rate(rpc_durations_seconds_count[5m])) by (job, service)
Then reference the file in prometheus.yml:
rule_files:
  - "prometheus.rules.yml"
This effectively creates a new metric, except that it is computed rather than scraped.
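A rule file can be validated and then picked up without restarting; a sketch, assuming the install paths used earlier (the reload endpoint only works when Prometheus runs with --web.enable-lifecycle, otherwise send SIGHUP):

/usr/local/prometheus/promtool check rules prometheus.rules.yml
curl -X POST http://localhost:9090/-/reload   # or: kill -HUP $(pidof prometheus)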
文档地址:https://prometheus.io/docs/alerting/latest/configuration/
Download and install:
wget https://github.com/prometheus/alertmanager/releases/download/v0.20.0/alertmanager-0.20.0.linux-amd64.tar.gz
tar -xf alertmanager-0.20.0.linux-amd64.tar.gz -C /usr/local
cd /usr/local && ln -sv alertmanager-0.20.0.linux-amd64/ alertmanager && cd alertmanager
Start it:
nohup ./alertmanager --config.file="alertmanager.yml" --storage.path="data/" --web.listen-address=":9093" &
image: prom/alertmanager
docker run
docker run -dit --name monitor-alertmanager \
  -v ./alertmanager/db/:/alertmanager \
  -v ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  -v ./alertmanager/templates/:/etc/alertmanager/templates \
  -p 9093:9093 --restart always --privileged \
  prom/alertmanager \
  --config.file="/etc/alertmanager/alertmanager.yml" \
  --storage.path="/alertmanager" \
  --web.listen-address=":9093"
Grouping
Example:
Suppose a network partition occurs while dozens or hundreds of service instances are running in a cluster, and half of them can no longer reach the database. The Prometheus alerting rules fire one alert per service instance that cannot talk to the database, so hundreds of alerts reach Alertmanager. As a user you only want a single page, while still being able to see exactly which instances are affected. Alertmanager can therefore be configured to group alerts by cluster and alert name, so that it sends one compact notification.
Inhibition
Inhibition is the concept of suppressing notifications for certain alerts when certain other alerts are already firing.
For example, if an alert is firing that says an entire cluster is unreachable, Alertmanager can be configured to mute all other alerts related to that cluster. This prevents hundreds or thousands of notifications that are unrelated to the actual problem.
Silences
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["localhost:9093"]

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"

scrape_configs:
Example rules file:
[root@xiang-03 /usr/local/prometheus]# cat rules/node.yml
groups:
- name: "system info"
  rules:
Alertmanager's main configuration file:
Let's look at an example first:
vim alertmanager.yml

global:
  smtp_smarthost: 'xxx'
  smtp_from: 'xxx'
  smtp_auth_username: 'xxx'
  smtp_auth_password: 'xxx'
  smtp_require_tls: false
templates:
  - 'test.tmpl'
receivers:
  - name: 'default'
    email_configs:
      - to: 'xxx@xx.xx'
        html: '{{ template "xx.html" . }}'
        headers: { Subject: " {{ 第二路由匹配测试}}" }
vim test.tmpl
{{ define "xx.html" }}
<table border="5">
<tr><td>报警项</td>
<td>磁盘</td>
<td>报警阀值</td>
<td>开始时间</td>
</tr>
{{ range $i, $alert := .Alerts }}
<tr><td>{{ index $alert.Labels "alertname" }}</td>
<td>{{ index $alert.Labels "instance" }}</td>
<td>{{ index $alert.Annotations "value" }}</td>
<td>{{ $alert.StartsAt }}</td>
</tr>
{{ end }}
</table>
{{ end }}
global:
resolve_timeout:   # how long to wait before declaring an alert resolved once it stops firing
route:   # the root route that every incoming alert enters; it defines the dispatch policy
group_by: ['LABEL_NAME','alertname', 'cluster','job','instance',...]
The labels listed here regroup incoming alerts; for example, alerts that carry labels such as cluster=A and alertname=LatencyHigh will be aggregated into a single group.
group_wait: 30s
When a new alert group is created, wait at least group_wait before sending the initial notification; this leaves enough time to collect several alerts for the same group and send them together.
group_interval: 5m
After the first notification has been sent, wait group_interval before sending notifications about new alerts added to the group.
repeat_interval: 5m
If a notification has already been sent successfully, wait repeat_interval before sending it again.
match:
  label_name: NAME
Exact label matching; alerts that match are sent to the receiver.
match_re:
  label_name: <regex>, ...
Regular-expression label matching; alerts that match are sent to the receiver.
receiver: receiver_name
Alerts matching match or match_re are handed to the backend notification integration (email, webhook, PagerDuty, WeChat, ...).
A default receiver is mandatory, otherwise: err="root route must specify a default receiver"
routes: - <route> ...
Defines additional (nested) routes.
templates: [ - <filepath> ... ]
Template files, e.g. the HTML template used for email alerts.
receivers: - <receiver> ...   # a list of receivers
- name: receiver_name   # the name referenced by route.receiver
email_configs:   # email notification settings
- to: <tmpl_string>
  send_resolved: <boolean> | default = false   # whether to send a notification when the alert is resolved
Configures the mailbox that receives alert emails; a per-receiver sender mailbox can also be set. See the official documentation:
https://prometheus.io/docs/alerting/latest/configuration/#email_config
- name: ...
  wechat_configs:
    - send_resolved: <boolean> | default = false
      api_secret: <secret> | default = global.wechat_api_secret
      api_url: <string> | default = global.wechat_api_url
      corp_id: <string> | default = global.wechat_api_corp_id
      message: <tmpl_string> | default = '{{ template "wechat.default.message" . }}'
      agent_id: <string> | default = '{{ template "wechat.default.agent_id" . }}'
      to_user: <string> | default = '{{ template "wechat.default.to_user" . }}'
      to_party: <string> | default = '{{ template "wechat.default.to_party" . }}'
      to_tag: <string> | default = '{{ template "wechat.default.to_tag" . }}'

Field notes:
- to_user: WeChat Work user ID
- to_party: ID of the department (group) to notify
- corp_id: unique ID of the WeChat Work account (see "My Company" in the console)
- agent_id: ID of the application (App Management -> open the custom app)
- api_secret: secret of the application

Register a WeChat Work account: https://work.weixin.qq.com
WeChat Work API documentation: https://work.weixin.qq.com/api/doc#90002/90151/90854
WeChat Work alert configuration
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
Inhibition settings
Register a company: https://work.weixin.qq.com
An unverified company can be registered (up to 200 members); after binding a personal WeChat account you can use the web console.
WeChat Work API documentation: https://work.weixin.qq.com/api/doc#90002/90151/90854
After registering, bind your personal WeChat account and scan the QR code to enter the admin console.
Create a new application for sending the alerts; the steps are straightforward.
Parameters to note:
receivers:
  - name: 'default'
    email_configs:
      - to: 'XXX'
        send_resolved: true
    wechat_configs:
      - send_resolved: true
        corp_id: 'XXX'
        api_secret: 'XXX'
        agent_id: 1000002
        to_user: XXX
        to_party: 2
        message: '{{ template "wechat.html" . }}'
template:
Because Alertmanager's default WeChat alert template is ugly and verbose, a custom template is used for WeChat; the default email template is acceptable.
cat wechat.tmpl
{{ define "wechat.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts }}
[@警报~]
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
详情: {{ .Annotations.description }}
值: {{ .Annotations.value }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
[@恢复~]
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- end }}
Reference: https://blog.csdn.net/knight_zhou/article/details/106323719
Custom Prometheus/Alertmanager email templates render timestamps in UTC by default.
Trigger time: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
After adding 8 hours (UTC+8): {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
vim rules/docker_monitor.yml

groups:
- name: "container monitor"
  rules:
  - alert: "Container down: env1"
    expr: time() - container_last_seen{name="env1"} > 60
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Container down: {{$labels.instance}} name={{$labels.name}}"
Note:
This metric only detects that a container has gone down; it cannot reliably detect recovery. Even if the container never comes back up, a resolved notification will still arrive after a while, because the stale container_last_seen series eventually disappears and the expression stops returning data.
A more complete example of host alerting rules (truncated expressions reconstructed from the fuller rules shown later in this document):

groups:
- name: 主机状态-监控告警
  rules:
  - alert: 主机状态
    expr: up == 0
    for: 1m
    labels:
      status: 非常严重
    annotations:
      summary: "{{$labels.instance}}:服务器宕机"
      description: "{{$labels.instance}}:服务器延时超过5分钟"
  - alert: CPU使用情况
    expr: 100-(avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
    for: 1m
    labels:
      status: 一般告警
    annotations:
      summary: "{{$labels.mountpoint}} CPU使用率太高!"
      description: "{{$labels.mountpoint }} CPU使用大于60%(目前使用:{{$value}}%)"
  - alert: cpu使用率太高告警   # 查询提供了hostname label
    expr: (100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100)) * on(instance) group_left(nodename) (node_uname_info) > 85
    for: 5m
    labels:
      region: 成都
    annotations:
      summary: "{{$labels.instance}}({{$labels.nodename}})CPU使用率太高!"
      description: '服务器{{$labels.instance}}({{$labels.nodename}})CPU使用率超过85%(目前使用:{{$value}}%)'
  - alert: 系统负载太高
    expr: (node_load1/count without (cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) (node_uname_info) > 1.1
    for: 3m
    labels:
      region: 成都
    annotations:
      summary: "{{$labels.instance}}({{$labels.nodename}})系统负载太高!"
      description: '{{$labels.instance}}({{$labels.nodename}})当前负载超标率 {{printf "%.2f" $value}}'
  - alert: 内存不足告警
    expr: (100 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100)* on(instance) group_left(nodename) (node_uname_info) > 80
    for: 3m
    labels:
      region: 成都
    annotations:
      summary: "{{$labels.instance}}({{$labels.nodename}})内存使用率太高!"
      description: '服务器{{$labels.instance}}({{$labels.nodename}})内存使用率超过80%(目前使用:{{$value}}%)'
  - alert: IO操作耗时
    expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 40
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} 流入磁盘IO使用率太高!"
      description: "{{$labels.mountpoint }} 流入磁盘IO大于60%(目前使用:{{$value}})"
  - alert: 网络流入
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} 流入网络带宽太高!"
      description: "{{$labels.mountpoint }}流入网络带宽持续2分钟高于100M. RX带宽使用率{{$value}}"
  - alert: 网络流出
    expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} 流出网络带宽太高!"
      description: "{{$labels.mountpoint }}流出网络带宽持续2分钟高于100M. RX带宽使用率{{$value}}"
  - alert: network in
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 12.5
    for: 1m
    labels:
      name: network
      severity: Critical
    annotations:
      summary: "{{$labels.mountpoint}} 流入网络带宽太高"
      description: "{{$labels.mountpoint }}流入网络异常,高于100M"
      value: "{{ $value }}"
  - alert: network out
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 12.5
    for: 1m
    labels:
      name: network
      severity: Critical
    annotations:
      summary: "{{$labels.mountpoint}} 发送网络带宽太高"
      description: "{{$labels.mountpoint }}发送网络异常,高于100M"
      value: "{{ $value }}"
  - alert: TCP会话
    expr: node_netstat_Tcp_CurrEstab > 1000
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} TCP_ESTABLISHED太高!"
      description: "{{$labels.mountpoint }} TCP_ESTABLISHED大于1000%(目前使用:{{$value}}%)"
  - alert: 磁盘容量
    expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100) > 80
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} 磁盘分区使用率太高!"
      description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"
  - alert: 硬盘空间不足告警   # 查询结果多了hostname等label
    expr: (100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100))* on(instance) group_left(nodename) (node_uname_info) > 80
    for: 3m
    labels:
      region: 成都
    annotations:
      summary: "{{$labels.instance}}({{$labels.nodename}})硬盘使用率太高!"
      description: '服务器{{$labels.instance}}({{$labels.nodename}})硬盘使用率超过80%(目前使用:{{$value}}%)'
  - alert: volume full in four days   # 预计磁盘4天后写满
    expr: predict_linear(node_filesystem_free_bytes[2h], 4 * 24 * 3600) < 0
    for: 5m
    labels:
      name: disk
      severity: Critical
    annotations:
      summary: "{{$labels.mountpoint}} 预计主机可用磁盘空间4天后将写满"
      description: "{{$labels.mountpoint }}"
      value: "{{ $value }}%"
  - alert: disk write rate
    expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 1m
    labels:
      name: disk
      severity: Critical
    annotations:
      summary: "disk write rate (instance {{ $labels.instance }})"
      description: "磁盘写入速率大于50MB/s"
      value: "{{ $value }}%"
  - alert: disk read latency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
    for: 1m
    labels:
      name: disk
      severity: Critical
    annotations:
      summary: "unusual disk read latency (instance {{ $labels.instance }})"
      description: "磁盘读取延迟大于100毫秒"
      value: "{{ $value }}%"
  - alert: disk write latency
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1
    for: 1m
    labels:
      name: disk
      severity: Critical
    annotations:
      summary: "unusual disk write latency (instance {{ $labels.instance }})"
      description: "磁盘写入延迟大于100毫秒"
      value: "{{ $value }}%"
GET  /-/healthy
GET  /-/ready
POST /-/reload
curl -u monitor:fosafer.com 127.0.0.1:9093/-/healthy
OK

curl -XPOST -u monitor:fosafer.com 127.0.0.1:9093/-/reload

[root@host40 monitor]# curl -XPOST -u monitor:fosafer.com 127.0.0.1:9093/-/reload
failed to reload config: yaml: unmarshal errors: line 26: field receiver already set in type config.plain
This is equivalent to docker exec -it monitor-alertmanager kill -1 1, except that the HTTP endpoint reports an error when the reload fails.
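The configuration itself can be checked before triggering a reload; a sketch using amtool, which ships in the Alertmanager tarball (paths follow the install layout above):

/usr/local/alertmanager/amtool check-config /usr/local/alertmanager/alertmanager.yml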
blackbox_exporter is one of the official Prometheus exporters; it collects probe data over HTTP, DNS, TCP, and ICMP.
Use cases:
- HTTP probing: set request headers; check the HTTP status, response headers, and body content
- TCP probing: watch service port status; define and check application-layer protocol exchanges
- ICMP probing: host liveness checks
- POST probing: API reachability
- SSL certificate expiry time
Download and unpack:
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz
tar -xf blackbox_exporter-0.18.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local
ln -sv blackbox_exporter-0.18.0.linux-amd64 blackbox_exporter
cd blackbox_exporter
./blackbox_exporter --version
Add a systemd unit:
vim /lib/systemd/system/blackbox_exporter.service

[Unit]
Description=blackbox_exporter
After=network.target

[Service]
User=root
Type=simple
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl enable blackbox_exporter
systemctl start blackbox_exporter
image: prom/blackbox-exporter:master
docker run:
docker run --rm -d -p 9115:9115 --name blackbox_exporter -v `pwd`:/config prom/blackbox-exporter:master --config.file=/config/blackbox.yml
Default configuration file:
The default blackbox_exporter configuration covers most needs; for custom modules see the official documentation and the example configuration file in the project repository.
cat blackbox.yml
modules:
  http_2xx:
    prober: http
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^SSH-2.0-"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
      - send: "NICK prober"
      - send: "USER prober prober prober :prober"
      - expect: "PING :([^ ]+)"
        send: "PONG ${1}"
      - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
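If a custom module is needed, it can be appended alongside the defaults; a hedged sketch (the module name http_2xx_https_only and its values are made up for illustration):

cat >> blackbox.yml <<'EOF'
  http_2xx_https_only:          # hypothetical module: HTTPS-only probe with an allow-list of status codes
    prober: http
    timeout: 5s
    http:
      valid_status_codes: [200, 301, 302]
      fail_if_not_ssl: true
EOF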
官方介绍: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
Notes:
labels:
  job: job_name
  __address__: <host>:<port>
  instance: defaults to __address__ unless relabeled
  __scheme__: scheme
  __metrics_path__: path
  __param_<name>: the first <name> parameter that appears in the URL
scrape_configs:
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]   # Look for a HTTP 200 response.
    static_configs:
      - targets:
        - http://prometheus.io    # Target to probe with http.
        - https://prometheus.io   # Target to probe with https.
        - http://example.com:8080 # Target to probe with http on port 8080.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115   # The blackbox exporter's real hostname:port.
- job_name: "blackbox_telnet_port"
  scrape_interval: 5s
  metrics_path: /probe
  params:
    module: [tcp_connect]
  static_configs:
    - targets: [ '1x3.x1.xx.xx4:443' ]
      labels:
        group: 'xxxidc机房ip监控'
    - targets: ['10.xx.xx.xxx:443']
      labels:
        group: 'Process status of nginx(main) server'
  relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 10.xxx.xx.xx:9115
- job_name: 'blackbox00_ping_idc_ip'
  scrape_interval: 10s
  metrics_path: /probe
  params:
    module: [icmp]   # ping
  static_configs:
    - targets: [ '1x.xx.xx.xx' ]
      labels:
        group: 'xxnginx 虚拟IP'
  relabel_configs:
    - source_labels: [__address__]
      regex: (.*)(:80)?
      target_label: __param_target
      replacement: ${1}
    - source_labels: [__param_target]
      regex: (.*)
      target_label: ping
      replacement: ${1}
    - source_labels: []
      regex: .*
      target_label: __address__
      replacement: 1x.xxx.xx.xx:9115
- job_name: 'blackbox_http_2xx_post'
  scrape_interval: 10s
  metrics_path: /probe
  params:
    module: [http_post_2xx_query]
  static_configs:
    - targets:
      - https://xx.xxx.com/api/xx/xx/fund/query.action
      labels:
        group: 'Interface monitoring'
  relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 1x.xx.xx.xx:9115   # The blackbox exporter's real hostname:port.
cat << 'EOF' > prometheus.yml
rule_files:
  - ssl_expiry.rules
scrape_configs:
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]   # Look for a HTTP 200 response.
    static_configs:
      - targets:
        - example.com   # Target to probe
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115   # Blackbox exporter.
EOF

cat << 'EOF' > ssl_expiry.rules
groups:
- name: ssl_expiry.rules
  rules:
  - alert: SSLCertExpiringSoon
    expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 30
    for: 10m
EOF
This is equivalent to querying the exporter directly:
curl 'http://172.16.10.65:9115/probe?target=prometheus.io&module=http_2xx&debug=true'
For ICMP, TCP, HTTP, and POST probes, the probe_success metric tells whether the probe succeeded:
probe_success == 0   # probe failed
probe_success == 1   # probe succeeded
Alerting checks the same metric: when it equals 0, an alert is fired.
[sss@prometheus01 prometheus]$ cat rules/blackbox-alert.rules
groups:
- name: blackbox_network_stats
  rules:
  - alert: blackbox_network_stats
    expr: probe_success == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} ,server-name: {{ $labels.server_name }} is down"
The stack consists of prometheus, alertmanager, grafana, nginx, node_exporter, cadvisor, and blackbox_exporter.
prom/prometheus
prom/alertmanager
quay.io/prometheus/node-exporter, prom/node-exporter
gcr.io/google_containers/cadvisor[:v0.36.0]   # requires access to Google's registry
google/cadvisor:v0.33.0                       # Docker Hub image; not as recent as the gcr.io one
grafana/grafana
nginx
Pull the images, re-tag them, and push them to the local Harbor registry.
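A sketch of the pull/re-tag/push cycle for one image (the registry address and project path are the ones used in the compose file below; the upstream tag v2.22.0 is an assumption matching the local tag):

docker pull prom/prometheus:v2.22.0
docker tag prom/prometheus:v2.22.0 10.10.11.40:80/base/prometheus:2.22.0
docker push 10.10.11.40:80/base/prometheus:2.22.0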
image: 10.10.11.40:80/base/nginx:1.19.3
image: 10.10.11.40:80/base/prometheus:2.22.0
image: 10.10.11.40:80/base/grafana:7.2.2
image: 10.10.11.40:80/base/alertmanager:0.21.0
image: 10.10.11.40:80/base/node_exporter:1.0.1
image: 10.10.11.40:80/base/cadvisor:v0.33.0
image: 10.10.11.40:80/base/blackbox-exporter:0.18.0
Directory layout overview
mkdir /home/deploy/monitor
cd /home/deploy/monitor
[root@host40 monitor]# tree
.
├── alertmanager
│   ├── alertmanager.yml
│   ├── db
│   │   ├── nflog
│   │   └── silences
│   └── templates
│       └── wechat.tmpl
├── blackbox_exporter
│   └── blackbox.yml
├── docker-compose.yml
├── grafana
│   └── db
│       ├── grafana.db
│       ├── plugins
│       ...
├── nginx
│   ├── auth
│   └── nginx.conf
├── node-exporter
│   └── textfiles
├── node_exporter_install_docker.sh
├── prometheus
│   ├── db
│   ├── prometheus.yml
│   ├── rules
│   │   ├── docker_monitor.yml
│   │   ├── system_monitor.yml
│   │   └── tcp_monitor.yml
│   └── sd_files
│       ├── docker_host.yml
│       ├── http.yml
│       ├── icmp.yml
│       ├── real_lan.yml
│       ├── real_wan.yml
│       ├── sedFDm5Rw
│       ├── tcp.yml
│       ├── virtual_lan.yml
│       └── virtual_wan.yml
└── sd_controler.sh
File required for nginx basic auth:
[root@host40 monitor-bak]# ls nginx/auth/ -a
.  ..  .htpasswd
Permissions for some of the mounts:
The db directories of prometheus, grafana, and alertmanager need 777 permissions. Individually mounted config files (alertmanager.yml, prometheus.yml, nginx.conf) need 666. For better security, put the config files in a dedicated directory, mount that directory instead, and point the startup arguments in command at the config file.
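Applied to the layout above, that amounts to something like this (a sketch, run from /home/deploy/monitor):

cd /home/deploy/monitor
chmod 777 prometheus/db grafana/db alertmanager/db
chmod 666 prometheus/prometheus.yml alertmanager/alertmanager.yml nginx/nginx.conf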
[root@host40 monitor-bak]# cat docker-compose.yml
version: "3"
services:
  nginx:
    image: 10.10.11.40:80/base/nginx:1.19.3
    hostname: nginx
    container_name: monitor-nginx
    restart: always
    privileged: false
    ports:
      - 3001:3000
      - 9090:9090
      - 9093:9093
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf
      - ./nginx/auth:/etc/nginx/basic_auth
    networks:
      monitor:
        aliases:
          - nginx
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  prometheus:
    image: 10.10.11.40:80/base/prometheus:2.22.0
    container_name: monitor-prometheus
    hostname: prometheus
    restart: always
    privileged: true
    volumes:
      - ./prometheus/db/:/prometheus/
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/rules/:/etc/prometheus/rules/
      - ./prometheus/sd_files/:/etc/prometheus/sd_files/
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--storage.tsdb.retention=60d'
    networks:
      monitor:
        aliases:
          - prometheus
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  grafana:
    image: 10.10.11.40:80/base/grafana:7.2.2
    container_name: monitor-grafana
    hostname: grafana
    restart: always
    privileged: true
    volumes:
      - ./grafana/db/:/var/lib/grafana
    networks:
      monitor:
        aliases:
          - grafana
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  alertmanger:
    image: 10.10.11.40:80/base/alertmanager:0.21.0
    container_name: monitor-alertmanager
    hostname: alertmanager
    restart: always
    privileged: true
    volumes:
      - ./alertmanager/db/:/alertmanager
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./alertmanager/templates/:/etc/alertmanager/templates
    networks:
      monitor:
        aliases:
          - alertmanager
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  node-exporter:
    image: 10.10.11.40:80/base/node_exporter:1.0.1
    container_name: monitor-node-exporter
    hostname: host40
    restart: always
    privileged: true
    volumes:
      - /:/host:ro,rslave
      - ./node-exporter/textfiles/:/textfiles
    network_mode: "host"
    command:
      - '--path.rootfs=/host'
      - '--web.listen-address=:9100'
      - '--collector.textfile.directory=/textfiles'
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  cadvisor:
    image: 10.10.11.40:80/base/cadvisor:v0.33.0
    container_name: monitor-cadvisor
    hostname: cadvisor
    restart: always
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    ports:
      - 9080:8080
    networks:
      monitor:
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
  blackbox_exporter:
    image: 10.10.11.40:80/base/blackbox-exporter:0.18.0
    container_name: monitor-blackbox
    hostname: blackbox-exporter
    restart: always
    privileged: true
    volumes:
      - ./blackbox_exporter/:/etc/blackbox_exporter
    networks:
      monitor:
        aliases:
          - blackbox
    command:
      - '--config.file=/etc/blackbox_exporter/blackbox.yml'
    logging:
      driver: json-file
      options:
        max-file: '5'
        max-size: 50m
networks:
  monitor:
    ipam:
      config:
        - subnet: 192.168.17.0/24
Because prometheus and alertmanager have no built-in authentication, nginx sits in front of them to provide routing and basic auth, proxying all the backend listen ports in one place for easier management.
prometheus: 9090
grafana: 3000
alertmanager: 9093
node_exporter: 9100
cadvisor: 8080 (on each monitored host)
echo monitor:`openssl passwd -crypt 123456` > .htpasswd
When a config file is mounted individually, edits on the host may not be picked up inside the container (you can also mount the directory rather than the file):
chmod 666 nginx.conf
Reload the configuration inside the nginx container:
docker exec -it web-director nginx -s reload
nginx.conf
[root@host40 monitor-bak]# cat nginx/nginx.conf
user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log;
pid /run/nginx.pid;

include /usr/share/nginx/modules/*.conf;

events {
    worker_connections 10240;
}

http {
    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" '
                    '"$http_user_agent" "$http_x_forwarded_for"';

    access_log /var/log/nginx/access.log main;

    sendfile            on;
    tcp_nopush          on;
    tcp_nodelay         on;
    keepalive_timeout   65;
    types_hash_max_size 2048;

    include             /etc/nginx/mime.types;
    default_type        application/octet-stream;
proxy_connect_timeout500ms;
proxy_send_timeout1000ms;
proxy_read_timeout3000ms;
proxy_buffers 64 8k;
proxy_busy_buffers_size 128k;
proxy_temp_file_write_size 64k;
proxy_redirect off;
proxy_next_upstream error invalid_header timeout http_502 http_504;
proxy_http_version 1.1;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Real-Port $remote_port;
proxy_set_header Host $http_host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
client_max_body_size 10m;
client_body_buffer_size 512k;
client_body_timeout 180;
client_header_timeout 10;
send_timeout 240;
gzip on;
gzip_min_length 1k;
gzip_buffers 4 16k;
gzip_comp_level 2;
gzip_types application/javascript application/x-javascript text/css text/javascript image/jpeg image/gif image/png;
gzip_vary off;
gzip_disable "MSIE [1-6].";
server {
listen 3000;
server_name _;
location / {
proxy_pass http://grafana:3000;
}
}
server {
listen 9090;
server_name _;
location / {
auth_basic "auth for monitor";
auth_basic_user_file /etc/nginx/basic_auth/.htpasswd;
proxy_pass http://prometheus:9090;
}
}
server {
listen 9093;
server_name _;
location / {
auth_basic "auth for monitor";
auth_basic_user_file /etc/nginx/basic_auth/.htpasswd;
proxy_pass http://alertmanager:9093;
}
}
}
### 9.5 prometheus

Note: the db directory must be writable (permission 777).

#### 9.5.1 Main configuration file: prometheus.yml
[root@host40 monitor-bak]# cat prometheus/prometheus.yml
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

rule_files:
  - "rules/*.yml"

scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'node_real_lan'
    file_sd_configs:
      - files:
          - sd_files/real_lan.yml
  - job_name: 'node_virtual_lan'
    file_sd_configs:
      - files:
          - sd_files/virtual_lan.yml
  - job_name: 'node_real_wan'
    file_sd_configs:
      - files:
          - sd_files/real_wan.yml
  - job_name: 'node_virtual_wan'
    file_sd_configs:
      - files:
          - sd_files/virtual_wan.yml
ls prometheus/sd_files/
docker_host.yml  http.yml  icmp.yml  real_lan.yml  real_wan.yml  sedFDm5Rw  tcp.yml  virtual_lan.yml  virtual_wan.yml
cat prometheus/sd_files/docker_host.yml
- targets: ['10.10.11.178:9080']
- targets: ['10.10.11.99:9080']
- targets: ['10.10.11.40:9080']
- targets: ['10.10.11.35:9080']
- targets: ['10.10.11.45:9080']
- targets: ['10.10.11.46:9080']
- targets: ['10.10.11.48:9080']
- targets: ['10.10.11.47:9080']
- targets: ['10.10.11.65:9081']
- targets: ['10.10.11.61:9080']
- targets: ['10.10.11.66:9080']
- targets: ['10.10.11.68:9080']
- targets: ['10.10.11.98:9080']
- targets: ['10.10.11.75:9080']
- targets: ['10.10.11.97:9080']
- targets: ['10.10.11.179:9080']
cat prometheus/sd_files/tcp.yml
- targets: ['10.10.11.178:8001']
  labels:
    server_name: http_download
- targets: ['10.10.11.178:3307']
  labels:
    server_name: xiaojing_db
- targets: ['10.10.11.178:3001']
  labels:
    server_name: test_web
cat prometheus/rules/docker_monitor.yml
groups:
- name: "container monitor"
  rules:
  - alert: "Container down: env1"
    expr: time() - container_last_seen{name="env1"} > 60
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Container down: {{$labels.instance}} name={{$labels.name}}"
tcp rules:
cat prometheus/rules/tcp_monitor.yml
groups:
- name: blackbox_network_stats
  rules:
  - alert: blackbox_network_stats
    expr: probe_success == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} ,server-name: {{ $labels.server_name }} is down"
      description: "连接不通..."
cat prometheus/rules/system_monitor.yml
groups:
- name: "system info"
  rules:
  - alert: "服务器宕机"
    expr: up == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{$labels.instance}}:服务器宕机"
      description: "{{$labels.instance}}:服务器无法连接,持续时间已超过3mins"
  - alert: "系统负载太高"
    expr: (node_load1/count without (cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(nodename) (node_uname_info) > 1.1
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "{{$labels.instance}}:系统负载太高"
      description: "{{$labels.instance}}:系统负载太高."
      value: "{{$value}}"
  - alert: "CPU 使用率超过90%"
    expr: 100-(avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 90
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{$labels.instance}}:CPU 使用率90%"
      description: "{{$labels.instance}}:CPU 使用率超过90%."
      value: "{{$value}}"
  - alert: "内存使用率超过80%"
    expr: (100 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100)* on(instance) group_left(nodename) (node_uname_info) > 80
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{$labels.instance}}:内存使用率80%"
      description: "{{$labels.instance}}:内存使用率超过80%"
      value: "{{$value}}"
  - alert: "IO操作耗时超过60%"
    expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 40
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{$labels.instance}}:IO操作耗时超过60%"
      description: "{{$labels.instance}}:IO操作耗时超过60%"
      value: "{{$value}}"
  - alert: "磁盘分区容量超过85"
    expr: (100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100))* on(instance) group_left(nodename) (node_uname_info) > 85
    for: 3m
    labels:
      severity: longtime
    annotations:
      summary: "{{$labels.instance}}:磁盘分区容量超过85%"
      description: "{{$labels.instance}}:磁盘分区容量超过85%"
      value: "{{$value}}"
  - alert: "磁盘将在4天后写满"
    expr: predict_linear(node_filesystem_free_bytes[2h], 4 * 24 * 3600) < 0
    for: 3m
    labels:
      severity: longtime
    annotations:
      summary: "{{$labels.instance}}: 预计将有磁盘分区在4天后写满,"
      description: "{{$labels.instance}}:预计将有磁盘分区在4天后写满,"
      value: "{{$value}}"
Note: the db directory must be writable.
Main configuration file:
cat alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtphz.qiye.163.com:25'
  smtp_from: 'XXX@fosafer.com'
  smtp_auth_username: 'XXX@fosafer.com'
  smtp_auth_password: 'XXX'
  smtp_hello: 'qiye.163.com'
  smtp_require_tls: true
route:
  group_by: ['instance']
  group_wait: 30s
  receiver: default
  routes:
    - group_interval: 3m
      repeat_interval: 10m
      match:
        severity: warning
      receiver: 'default'
    - group_interval: 3m
      repeat_interval: 30m
      match:
        severity: critical
      receiver: 'default'
    - group_interval: 5m
      repeat_interval: 24h
      match:
        severity: longtime
      receiver: 'default'
templates:
  - '/etc/alertmanager/templates/*.tmpl'
receivers:
  - name: 'default'
    email_configs:
      - to: 'XXX'
        send_resolved: true
    wechat_configs:
      - send_resolved: true
        corp_id: 'XXX'
        api_secret: 'XXX'
        agent_id: 1000002
        to_user: XXX
        to_party: 2
        message: '{{ template "wechat.html" . }}'
  - name: 'critical'
    email_configs:
      - to: 'XXX'
        send_resolved: true
Alert template file:
cat alertmanager/templates/wechat.tmpl
{{ define "wechat.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts }}
[@警报~]
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
详情: {{ .Annotations.description }}
值: {{ .Annotations.value }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
[@恢复~]
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- end }}
Installation script:
http://10.10.11.178:8001/node_exporter_install.sh
Installation script (Docker version):
http://10.10.11.178:8001/node_exporter_install_docker.sh
Required images: for Docker hosts that do not have the 10.10.11.40:80 registry configured, download the saved images and docker load them before installing.
http://10.10.11.178:8001/monitor-client.tgz
All jobs use file-based service discovery, so adding a target only requires writing it into the corresponding sd_file; no configuration reload is needed.
On top of this, a text-processing script acts as a front end to the sd_files, so targets can be added and removed from the command line without editing the files by hand.
Script name: sd_controler.sh
Usage: running ./sd_controler.sh with no arguments prints the usage message.
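Typical invocations, taken from the script's own usage text:

./sd_controler.sh rl add 10.10.10.10:9100              # add a node_exporter target to the real-LAN job
./sd_controler.sh tcp add 10.10.10.10:3306 web-mysql   # tcp/http/icmp targets require a server_name label
./sd_controler.sh http del www.baidu.com               # remove a target
./sd_controler.sh rl show                              # list current targets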
The full script:
[root@host40 monitor]# cat sd_controler.sh
#!/bin/bash
#version: 1.0
#Description: add | del | show instance from|to prometheus file_sd_files.
# rl | vl | dk | rw | vw | tcp | http | icmp : short for job name, each one means a sd_file.
# tcp | http | icmp ( because with ports for service ) add with label (server_name by default) to easy read in alert emails.
# each time can only add|del for one instance.
# Notes: this script adds, deletes, and lists entries (e.g. IP:PORT pairs) in Prometheus' file-based service-discovery files.
#   rl | vl | dk | rw | vw | tcp | http | icmp are shorthand for the Prometheus job names; each one maps to one sd_file.
#   For tcp/http/icmp targets it is often hard to tell from the port alone which service went down, so adding them
#   requires a server_name label, letting whoever receives the alert email see immediately which service is affected.
#   Only one entry can be added or deleted per invocation; for bulk changes edit the files directly or loop over the script.

### vars
SD_DIR=./prometheus/sd_files
DOCKER_SD=$SD_DIR/docker_host.yml
RL_HOST_SD=$SD_DIR/real_lan.yml
VL_HOST_SD=$SD_DIR/virtual_lan.yml
RW_HOST_SD=$SD_DIR/real_wan.yml
VW_HOST_SD=$SD_DIR/virtual_wan.yml
TCP_SD=$SD_DIR/tcp.yml
HTTP_SD=$SD_DIR/http.yml
ICMP_SD=$SD_DIR/icmp.yml
SDFILE=

### funcs
usage(){
  echo -e "Usage: $0 < rl | vl | dk | rw | vw | tcp | http | icmp > < add | del | show > [ IP:PORT | FQDN ] [ server-name ]"
  echo -e " example: \n\t node add:\t $0 rl add | del 10.10.10.10:9100\n\t tcp,http,icmp add:\t $0 tcp add 10.10.10.10:3306 web-mysql\n\t del:\t $0 http del www.baidu.com\n\t show:\t $0 rl | vl | dk | rw | vw | tcp | http | icmp show."
  exit
}

add(){
  # $1: SDFILE, $2: IP:PORT
  grep -q $2 $1 || echo -e "- targets: ['$2']" >> $1
}

del(){
  # $1: SDFILE, $2: IP:PORT
  sed -i '/'$2'/d' $1
}

add_with_label(){
  # $1: SDFILE, $2: [IP:[PORT]|FQDN], $3: SERVER-NAME
  LABEL_01="server_name"
  if ! grep -q "'$2'" $1;then
    echo -e "- targets: ['$2']" >> $1
    echo -e "  labels:" >> $1
    echo -e "    ${LABEL_01}: $3" >> $1
  fi
}

del_with_label(){
  # $1: SDFILE, $2: [IP:[PORT]|FQDN]
  NUM=`cat -n $SDFILE |grep "'$2'"|awk '{print $1}'`
  let ENDNUM=NUM+2
sed -i $NUM,${ENDNUM}d $1
}
action(){
if [ "$1" == "add" ];then
add $SDFILE $2
elif [ "$1" == "del" ];then
del $SDFILE $2
elif [ "$1" == "show" ];then
cat $SDFILE
fi
}
action_with_label(){
if [ "$1" == "add" ];then
add_with_label $SDFILE $2 $3
elif [ "$1" == "del" ];then
del_with_label $SDFILE $2 $3
elif [ "$1" == "show" ];then
cat $SDFILE
fi
}
[ "$2" == "" ] || [[ ! "$2" =~ ^(add|del|show)$ ]] && usage
curl --version &>/dev/null || { echo -e "no curl found. " && exit 15; }
if [[ $1 =~ ^(rl|vl|rw|vw|dk)$ ]] && [ "$2" == "add" ];then
[ "$3" == "" ] && usage
if [ "$4" != "-f" ];then
COOD=$(curl -IL -o /dev/null --retry 3 --connect-timeout 3 -s -w "%{http_code}" http://$3/metrics)
[ "$COOD" != "200" ] && echo -e "http://$3/metrics is not reachable. check it again, or use -f to ignore this check." && exit 11
fi
fi
if [[ $1 =~ ^(tcp|http|icmp)$ ]] && [ "$2" == "add" ];then
[ "$4" == "" ] && echo -e "server-name is required when adding tcp/http/icmp targets." && usage
fi
case $1 in
  rl)
    SDFILE=$RL_HOST_SD
    action $2 $3 && echo $2 OK
    ;;
  vl)
    SDFILE=$VL_HOST_SD
    action $2 $3 && echo $2 OK
    ;;
  dk)
    SDFILE=$DOCKER_SD
    action $2 $3 && echo $2 OK
    ;;
  rw)
    SDFILE=$RW_HOST_SD
    action $2 $3 && echo $2 OK
    ;;
  vw)
    SDFILE=$VW_HOST_SD
    action $2 $3 && echo $2 OK
    ;;
  tcp)
    SDFILE=$TCP_SD
    action_with_label $2 $3 $4 && echo $2 OK
    ;;
  http)
    SDFILE=$HTTP_SD
    action_with_label $2 $3 $4 && echo $2 OK
    ;;
  icmp)
    SDFILE=$ICMP_SD
    action_with_label $2 $3 $4 && echo $2 OK
    ;;
  *)
    usage
    ;;
esac