docker-compose-monitor.yml

# Compose file format version (quoted so it stays a string).
version: '2'

# User-defined bridge network; services opt in through their own `networks:` list.
networks:
  monitor:
    driver: bridge

services:
  # InfluxDB time-series database.
  influxdb:
    image: influxdb:latest
    container_name: tig-influxdb
    # host:container port mappings; quoted so they are always read as strings.
    ports:
      - "18083:8083"
      - "18086:8086"
      - "18090:8090"
    # Environment variables are loaded from this file next to the compose file.
    env_file:
      - 'env.influxdb'
    volumes:
      # Data persistency
      # sudo mkdir -p ./influxdb/data
      - ./influxdb/data:/var/lib/influxdb
      # Use UTC+8 time inside the container (per the original note; this
      # depends on the contents of the host-side ./timezone and ./localtime).
      - ./timezone:/etc/timezone:ro
      - ./localtime:/etc/localtime:ro
    restart: unless-stopped  # restart automatically unless explicitly stopped

  # Telegraf metrics agent; linked to the influxdb service.
  telegraf:
    image: telegraf:latest
    container_name: tig-telegraf
    links:
      - influxdb
    volumes:
      # Agent configuration, mounted read-only.
      - ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
      # Host-side timezone files, mounted read-only.
      - ./timezone:/etc/timezone:ro
      - ./localtime:/etc/localtime:ro
    restart: unless-stopped
  # Prometheus server, attached to the `monitor` network.
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
      # Main configuration and the alert rule file, bind-mounted from the host.
      - /home/qa/docker/grafana/prometheus.yml:/etc/prometheus/prometheus.yml
      - /home/qa/docker/grafana/node_down.yml:/etc/prometheus/node_down.yml
    ports:
      - '9090:9090'
    networks:
      - monitor

  # Alertmanager, attached to the `monitor` network.
  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      # Alertmanager configuration, bind-mounted from the host.
      - /home/qa/docker/grafana/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - '9093:9093'
    networks:
      - monitor

  # Grafana dashboards, published on host port 13000.
  # NOTE(review): no volume is mounted here, so dashboards/settings are
  # presumably lost when the container is removed - confirm whether
  # persistence is needed.
  grafana:
    image: grafana/grafana:6.7.4
    container_name: grafana
    hostname: grafana
    restart: always
    ports:
      - '13000:3000'
    networks:
      - monitor

  # node-exporter, published on host port 9100.
  node-exporter:
    image: quay.io/prometheus/node-exporter
    container_name: node-exporter
    hostname: node-exporter
    restart: always
    ports:
      - '9100:9100'
    networks:
      - monitor

  # cAdvisor, published on host port 18080.
  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    # Host paths exposed to the container; all read-only except /var/run.
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - '18080:8080'
    networks:
      - monitor

alertmanager.yml

# Alertmanager configuration: SMTP settings, routing, receivers, inhibition.
global:
  resolve_timeout: 5m
  # The Chinese values below are placeholders to be replaced before use:
  # '邮箱' = email address, '密码' = password.
  smtp_from: '邮箱'
  smtp_smarthost: 'smtp.exmail.qq.com:25'
  smtp_auth_username: '邮箱'
  smtp_auth_password: '密码'
  # TLS is not required for the SMTP connection.
  smtp_require_tls: false
  smtp_hello: 'qq.com'

# Every alert is grouped by alert name and delivered to the 'email' receiver.
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'

receivers:
  - name: 'email'
    email_configs:
      # '收件邮箱' is a placeholder meaning "recipient email address".
      - to: '收件邮箱'
        # Also send a notification when the alert resolves.
        send_resolved: true

# A 'critical' alert mutes 'warning' alerts that carry the same values for
# the labels listed in `equal`.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

prometheus.yml

global:
  # Scrape targets every 15 seconds (the default is every 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is every 1 minute).
  evaluation_interval: 15s
  # scrape_timeout is left unset, so the global default (10s) applies.

# Alertmanager configuration: where Prometheus sends firing alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.32.117:9093']
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "node_down.yml"
  # - "node-exporter-alert-rules.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape jobs. Each job lists the host:port endpoints to collect from.
scrape_configs:
  # IO storage node group.
  - job_name: 'io'
    scrape_interval: 8s
    # Each port is the one node-exporter was started on for that host.
    static_configs:
      - targets: ['192.168.32.117:9100']
      - targets: ['192.168.32.196:9100']
      - targets: ['192.168.32.136:9100']
      - targets: ['192.168.32.193:9100']
      - targets: ['192.168.32.153:9100']
      - targets: ['192.168.32.185:9100']
      - targets: ['192.168.32.190:19100']
      - targets: ['192.168.32.192:9100']

  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'cadvisor'
    # Each port is the one cAdvisor was started on for that host.
    static_configs:
      - targets: ['192.168.32.117:18080']
      - targets: ['192.168.32.193:8080']
      - targets: ['192.168.32.153:8080']
      - targets: ['192.168.32.185:8080']
      - targets: ['192.168.32.190:18080']
      - targets: ['192.168.32.192:18080']

node_down.yml

# Prometheus alerting rules: instance down, low memory, low disk, high CPU.
groups:
  - name: node_down
    rules:
      # Fires when a scrape target has reported up == 0 for 1 minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          user: test
        annotations:
          summary: 'Instance {{ $labels.instance }} down'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes.'

      # Less than 10% of memory is still available.
      - alert: '剩余内存小于10%'
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of memory (instance {{ $labels.instance }})'
          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Less than 10% of disk space is left on a filesystem that is not read-only.
      - alert: '剩余磁盘小于10%'
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of disk space (instance {{ $labels.instance }})'
          description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # CPU load above 80%: 100 minus the per-instance average idle rate over 2 minutes.
      - alert: 'CPU负载 > 80%'
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host high CPU load (instance {{ $labels.instance }})'
          description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

告警规则参考:https://awesome-prometheus-alerts.grep.to/rules#prometheus-self-monitoring

官网仪表盘:https://grafana.com/grafana/dashboards/

到此这篇关于docker部署grafana+prometheus配置的文章就介绍到这了,更多相关docker部署grafana+prometheus内容请搜索主题屋以前的文章或继续浏览下面的相关文章,希望大家以后多多支持主题屋!

发表回复

您的电子邮箱地址不会被公开。