initial commit
This commit is contained in:
70
prometheus/alert.rules
Normal file
70
prometheus/alert.rules
Normal file
@@ -0,0 +1,70 @@
|
||||
groups:
|
||||
- name: targets
  rules:
  # Fires for every scraped target whose `up` series has been 0 for 30 seconds.
  - alert: monitor_service_down
    expr: 'up == 0'
    for: 30s
    annotations:
      description: 'Service {{ $labels.instance }} is down.'
      summary: 'Monitor service non-operational'
    labels:
      severity: critical
|
||||
|
||||
- name: host
  rules:
  # 1-minute load average above 1.5, sustained for 30 seconds.
  - alert: high_cpu_load
    expr: node_load1 > 1.5
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server under high load"
      description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

  # Used memory (total minus free, buffers and cached) above 85 percent.
  # The expression is evaluated per series rather than through sum():
  # aggregating would strip the instance and job labels that the description
  # template prints, and would merge all hosts into one ratio.
  - alert: high_memory_load
    expr: >-
      (node_memory_MemTotal_bytes
      - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes))
      / node_memory_MemTotal_bytes * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server memory is almost full"
      description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

  # Used filesystem space above 85 percent.
  # NOTE(review): the selector only matches fstype="aufs"; filesystems of any
  # other type never trigger this rule - confirm this matches the hosts in use.
  - alert: high_storage_load
    expr: >-
      (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"})
      / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server storage is almost full"
      description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- name: containers
  rules:
  # No container_memory_usage_bytes series exists for name="jenkins".
  - alert: jenkins_down
    expr: 'absent(container_memory_usage_bytes{name="jenkins"})'
    for: 30s
    annotations:
      description: 'Jenkins container is down for more than 30 seconds.'
      summary: 'Jenkins down'
    labels:
      severity: critical

  # Jenkins CPU rate over 1m, divided by the number of
  # node_cpu_seconds_total{mode="system"} series (presumably one per core -
  # verify), expressed as a percentage; threshold is 10.
  - alert: jenkins_high_cpu
    expr: >-
      sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m]))
      / count(node_cpu_seconds_total{mode="system"})
      * 100 > 10
    for: 30s
    annotations:
      description: 'Jenkins CPU usage is {{ humanize $value}}%.'
      summary: 'Jenkins high CPU usage'
    labels:
      severity: warning

  # Jenkins memory usage above 1200000000 bytes (about 1.2 GB).
  - alert: jenkins_high_memory
    expr: 'sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000'
    for: 30s
    annotations:
      description: 'Jenkins memory consumption is at {{ humanize $value}}.'
      summary: 'Jenkins high memory usage'
    labels:
      severity: warning
|
||||
|
53
prometheus/prometheus.yml
Normal file
53
prometheus/prometheus.yml
Normal file
@@ -0,0 +1,53 @@
|
||||
global:
  # Interval between rule evaluations.
  evaluation_interval: 15s
  # Default interval between scrapes; a job may set its own scrape_interval.
  scrape_interval: 15s

  # Labels attached to every time series or alert sent to external systems
  # (federation, remote storage, Alertmanager).
  external_labels:
    monitor: docker-host-alpha
|
||||
|
||||
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
|
||||
rule_files:
  # Alert definitions live in the alert.rules file.
  - 'alert.rules'
|
||||
|
||||
# Scrape configurations: one job per monitored service, each with one static target.
|
||||
scrape_configs:
  # Scraped every 5s.
  - job_name: nodeexporter
    scrape_interval: 5s
    static_configs:
      - targets:
          - 'nodeexporter:9100'

  # Scraped every 5s.
  - job_name: cadvisor
    scrape_interval: 5s
    static_configs:
      - targets:
          - 'cadvisor:8080'

  # Prometheus scraping its own endpoint on localhost.
  - job_name: prometheus
    scrape_interval: 10s
    static_configs:
      - targets:
          - 'localhost:9090'

  # honor_labels keeps the labels carried by the scraped samples when they
  # conflict with the labels Prometheus would attach for this target.
  - job_name: pushgateway
    scrape_interval: 10s
    honor_labels: true
    static_configs:
      - targets:
          - 'pushgateway:9091'
|
||||
|
||||
|
||||
alerting:
  # Alerts are sent over plain HTTP to a single, statically configured Alertmanager.
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
      scheme: http
|
||||
|
||||
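# Disabled example jobs. This spot follows the alerting section, so move a job
# up into scrape_configs when enabling it rather than uncommenting it here.
# - job_name: 'nginx'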
|
||||
# scrape_interval: 10s
|
||||
# static_configs:
|
||||
# - targets: ['nginxexporter:9113']
|
||||
|
||||
# - job_name: 'aspnetcore'
|
||||
# scrape_interval: 10s
|
||||
# static_configs:
|
||||
# - targets: ['eventlog-proxy:5000', 'eventlog:5000']
|
Reference in New Issue
Block a user