initial commit
This commit is contained in:
commit
3caac51ca1
|
@ -0,0 +1,63 @@
|
|||
###############################################################################
|
||||
# Set default behavior to automatically normalize line endings.
|
||||
###############################################################################
|
||||
* text eol=lf
|
||||
|
||||
###############################################################################
|
||||
# Set default behavior for command prompt diff.
|
||||
#
|
||||
# This is needed for earlier builds of msysgit that do not have it on by
|
||||
# default for csharp files.
|
||||
# Note: This is only used by command line
|
||||
###############################################################################
|
||||
#*.cs diff=csharp
|
||||
|
||||
###############################################################################
|
||||
# Set the merge driver for project and solution files
|
||||
#
|
||||
# Merging from the command prompt will add diff markers to the files if there
|
||||
# are conflicts (Merging from VS is not affected by the settings below, in VS
|
||||
# the diff markers are never inserted). Diff markers may cause the following
|
||||
# file extensions to fail to load in VS. An alternative would be to treat
|
||||
# these files as binary and thus will always conflict and require user
|
||||
# intervention with every merge. To do so, just uncomment the entries below
|
||||
###############################################################################
|
||||
#*.sln merge=binary
|
||||
#*.csproj merge=binary
|
||||
#*.vbproj merge=binary
|
||||
#*.vcxproj merge=binary
|
||||
#*.vcproj merge=binary
|
||||
#*.dbproj merge=binary
|
||||
#*.fsproj merge=binary
|
||||
#*.lsproj merge=binary
|
||||
#*.wixproj merge=binary
|
||||
#*.modelproj merge=binary
|
||||
#*.sqlproj merge=binary
|
||||
#*.wwaproj merge=binary
|
||||
|
||||
###############################################################################
|
||||
# behavior for image files
|
||||
#
|
||||
# image files are treated as binary by default.
|
||||
###############################################################################
|
||||
*.jpg binary
|
||||
*.png binary
|
||||
*.gif binary
|
||||
|
||||
###############################################################################
|
||||
# diff behavior for common document formats
|
||||
#
|
||||
# Convert binary document formats to text before diffing them. This feature
|
||||
# is only available from the command line. Turn it on by uncommenting the
|
||||
# entries below.
|
||||
###############################################################################
|
||||
#*.doc diff=astextplain
|
||||
#*.DOC diff=astextplain
|
||||
#*.docx diff=astextplain
|
||||
#*.DOCX diff=astextplain
|
||||
#*.dot diff=astextplain
|
||||
#*.DOT diff=astextplain
|
||||
#*.pdf diff=astextplain
|
||||
#*.PDF diff=astextplain
|
||||
#*.rtf diff=astextplain
|
||||
#*.RTF diff=astextplain
|
|
@ -0,0 +1,4 @@
|
|||
# Editor / IDE working directories
.idea/
.vs/
.vscode/

# macOS Finder metadata (matches any name ending in "DS_Store")
*DS_Store
|
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2016 Stefan Prodan
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1,363 @@
|
|||
Prometheus-Grafana
|
||||
========
|
||||
|
||||
A monitoring solution for Docker hosts and containers with [Prometheus](https://prometheus.io/), [Grafana](http://grafana.org/), [cAdvisor](https://github.com/google/cadvisor),
|
||||
[NodeExporter](https://github.com/prometheus/node_exporter) and alerting with [AlertManager](https://github.com/prometheus/alertmanager).
|
||||
|
||||
This is a forked repository. So, you may want to visit the original repo at [stefanprodan
|
||||
/
|
||||
dockprom](https://github.com/stefanprodan/dockprom)
|
||||
|
||||
Additional info: [Docker - Prometheus and Grafana](https://bogotobogo.com/DevOps/Docker/Docker_Prometheus_Grafana.php)
|
||||
|
||||
## Install
|
||||
|
||||
### Create .env:
|
||||
```
|
||||
ADMIN_USER=admin
|
||||
ADMIN_PASSWORD=admin
|
||||
```
|
||||
|
||||
### Clone this repository on your Docker host, cd into the cloned directory and run compose up:
|
||||
|
||||
```
|
||||
git clone https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana.git
|
||||
cd Docker-Compose-Prometheus-and-Grafana
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
## Prerequisites:
|
||||
|
||||
* Docker Engine >= 1.13
|
||||
* Docker Compose >= 1.11
|
||||
|
||||
## Containers:
|
||||
|
||||
* Prometheus (metrics database) `http://<host-ip>:9090`
|
||||
* Prometheus-Pushgateway (push acceptor for ephemeral and batch jobs) `http://<host-ip>:9091`
|
||||
* AlertManager (alerts management) `http://<host-ip>:9093`
|
||||
* Grafana (visualize metrics) `http://<host-ip>:3000`
|
||||
* NodeExporter (host metrics collector)
|
||||
* cAdvisor (containers metrics collector)
|
||||
* Caddy (reverse proxy and basic auth provider for prometheus and alertmanager)
|
||||
|
||||
## Setup Grafana
|
||||
|
||||
Navigate to `http://<host-ip>:3000` and log in with user ***admin*** and password ***admin***. You can change the credentials in the compose file or by supplying the `ADMIN_USER` and `ADMIN_PASSWORD` environment variables via the `.env` file on compose up. Alternatively, the settings can be loaded from a config file referenced directly in the grafana service definition, like this:
|
||||
```
|
||||
grafana:
|
||||
image: grafana/grafana:5.2.4
|
||||
env_file:
|
||||
- config
|
||||
|
||||
```
|
||||
The config file should have the following content:
|
||||
```
|
||||
GF_SECURITY_ADMIN_USER=admin
|
||||
GF_SECURITY_ADMIN_PASSWORD=changeme
|
||||
GF_USERS_ALLOW_SIGN_UP=false
|
||||
```
|
||||
If you want to change the password, you have to remove this volume entry; otherwise the change will not take effect:
|
||||
```
|
||||
- grafana_data:/var/lib/grafana
|
||||
```
|
||||
|
||||
Grafana is preconfigured with dashboards and Prometheus as the default data source:
|
||||
|
||||
* Name: Prometheus
|
||||
* Type: Prometheus
|
||||
* Url: http://prometheus:9090
|
||||
* Access: proxy
|
||||
|
||||
***Docker Host Dashboard***
|
||||
|
||||
![Host](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Docker_Host.png)
|
||||
|
||||
The Docker Host Dashboard shows key metrics for monitoring the resource usage of your server:
|
||||
|
||||
* Server uptime, CPU idle percent, number of CPU cores, available memory, swap and storage
|
||||
* System load average graph, running and blocked by IO processes graph, interrupts graph
|
||||
* CPU usage graph by mode (guest, idle, iowait, irq, nice, softirq, steal, system, user)
|
||||
* Memory usage graph by distribution (used, free, buffers, cached)
|
||||
* IO usage graph (read Bps, write Bps and IO time)
|
||||
* Network usage graph by device (inbound Bps, Outbound Bps)
|
||||
* Swap usage and activity graphs
|
||||
|
||||
For storage and particularly Free Storage graph, you have to specify the fstype in grafana graph request.
|
||||
You can find it in `grafana/dashboards/docker_host.json`, at line 480 :
|
||||
|
||||
"expr": "sum(node_filesystem_free_bytes{fstype=\"btrfs\"})",
|
||||
|
||||
I work on BTRFS, so I need to change `aufs` to `btrfs`.
|
||||
|
||||
You can find the right value for your system in Prometheus at `http://<host-ip>:9090` by running this query:
|
||||
|
||||
node_filesystem_free_bytes
|
||||
|
||||
***Docker Containers Dashboard***
|
||||
|
||||
![Containers](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Docker_Containers.png)
|
||||
|
||||
The Docker Containers Dashboard shows key metrics for monitoring running containers:
|
||||
|
||||
* Total containers CPU load, memory and storage usage
|
||||
* Running containers graph, system load graph, IO usage graph
|
||||
* Container CPU usage graph
|
||||
* Container memory usage graph
|
||||
* Container cached memory usage graph
|
||||
* Container network inbound usage graph
|
||||
* Container network outbound usage graph
|
||||
|
||||
Note that this dashboard doesn't show the containers that are part of the monitoring stack.
|
||||
|
||||
***Monitor Services Dashboard***
|
||||
|
||||
![Monitor Services](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Prometheus.png)
|
||||
![Monitor Services](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Prometheus2.png)
|
||||
![Monitor Services](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Prometheus3.png)
|
||||
|
||||
The Monitor Services Dashboard shows key metrics for monitoring the containers that make up the monitoring stack:
|
||||
|
||||
* Prometheus container uptime, monitoring stack total memory usage, Prometheus local storage memory chunks and series
|
||||
* Container CPU usage graph
|
||||
* Container memory usage graph
|
||||
* Prometheus chunks to persist and persistence urgency graphs
|
||||
* Prometheus chunks ops and checkpoint duration graphs
|
||||
* Prometheus samples ingested rate, target scrapes and scrape duration graphs
|
||||
* Prometheus HTTP requests graph
|
||||
* Prometheus alerts graph
|
||||
|
||||
## Define alerts
|
||||
|
||||
Three alert groups have been setup within the [alert.rules](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules) configuration file:
|
||||
|
||||
* Monitoring services alerts [targets](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules#L2-L11)
|
||||
* Docker Host alerts [host](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules#L13-L40)
|
||||
* Docker Containers alerts [containers](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules#L42-L69)
|
||||
|
||||
You can modify the alert rules and reload them by making a HTTP POST call to Prometheus:
|
||||
|
||||
```
|
||||
curl -X POST http://admin:admin@<host-ip>:9090/-/reload
|
||||
```
|
||||
|
||||
***Monitoring services alerts***
|
||||
|
||||
Trigger an alert if any of the monitoring targets (node-exporter and cAdvisor) are down for more than 30 seconds:
|
||||
|
||||
```yaml
|
||||
- alert: monitor_service_down
|
||||
expr: up == 0
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Monitor service non-operational"
|
||||
description: "Service {{ $labels.instance }} is down."
|
||||
```
|
||||
|
||||
***Docker Host alerts***
|
||||
|
||||
Trigger an alert if the Docker host CPU is under high load for more than 30 seconds:
|
||||
|
||||
```yaml
|
||||
- alert: high_cpu_load
|
||||
expr: node_load1 > 1.5
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server under high load"
|
||||
description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
```
|
||||
|
||||
Modify the load threshold based on your CPU cores.
|
||||
|
||||
Trigger an alert if the Docker host memory is almost full:
|
||||
|
||||
```yaml
|
||||
- alert: high_memory_load
|
||||
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server memory is almost full"
|
||||
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
```
|
||||
|
||||
Trigger an alert if the Docker host storage is almost full:
|
||||
|
||||
```yaml
|
||||
- alert: high_storage_load
|
||||
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server storage is almost full"
|
||||
description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
```
|
||||
|
||||
***Docker Containers alerts***
|
||||
|
||||
Trigger an alert if a container is down for more than 30 seconds:
|
||||
|
||||
```yaml
|
||||
- alert: jenkins_down
|
||||
expr: absent(container_memory_usage_bytes{name="jenkins"})
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Jenkins down"
|
||||
description: "Jenkins container is down for more than 30 seconds."
|
||||
```
|
||||
|
||||
Trigger an alert if a container is using more than 10% of total CPU cores for more than 30 seconds:
|
||||
|
||||
```yaml
|
||||
- alert: jenkins_high_cpu
|
||||
expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Jenkins high CPU usage"
|
||||
description: "Jenkins CPU usage is {{ humanize $value}}%."
|
||||
```
|
||||
|
||||
Trigger an alert if a container is using more than 1.2GB of RAM for more than 30 seconds:
|
||||
|
||||
```yaml
|
||||
- alert: jenkins_high_memory
|
||||
expr: sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Jenkins high memory usage"
|
||||
description: "Jenkins memory consumption is at {{ humanize $value}}."
|
||||
```
|
||||
|
||||
## Setup alerting
|
||||
|
||||
The AlertManager service is responsible for handling alerts sent by Prometheus server.
|
||||
AlertManager can send notifications via email, Pushover, Slack, HipChat or any other system that exposes a webhook interface.
|
||||
A complete list of integrations can be found [here](https://prometheus.io/docs/alerting/configuration).
|
||||
|
||||
You can view and silence notifications by accessing `http://<host-ip>:9093`.
|
||||
|
||||
The notification receivers can be configured in [alertmanager/config.yml](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/alertmanager/config.yml) file.
|
||||
|
||||
To receive alerts via Slack you need to make a custom integration by choosing ***incoming web hooks*** in your Slack team app page.
|
||||
You can find more details on setting up Slack integration [here](http://www.robustperception.io/using-slack-with-the-alertmanager/).
|
||||
|
||||
Copy the Slack Webhook URL into the ***api_url*** field and specify a Slack ***channel***.
|
||||
|
||||
```yaml
|
||||
route:
|
||||
receiver: 'slack'
|
||||
|
||||
receivers:
|
||||
- name: 'slack'
|
||||
slack_configs:
|
||||
- send_resolved: true
|
||||
text: "{{ .CommonAnnotations.description }}"
|
||||
username: 'Prometheus'
|
||||
channel: '#<channel>'
|
||||
api_url: 'https://hooks.slack.com/services/<webhook-id>'
|
||||
```
|
||||
|
||||
![Slack Notifications](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Slack_Notifications.png)
|
||||
|
||||
## Sending metrics to the Pushgateway
|
||||
|
||||
The [pushgateway](https://github.com/prometheus/pushgateway) is used to collect data from batch jobs or from services.
|
||||
|
||||
To push data, simply execute:
|
||||
|
||||
echo "some_metric 3.14" | curl --data-binary @- http://user:password@localhost:9091/metrics/job/some_job
|
||||
|
||||
Please replace the `user:password` part with your user and password set in the initial configuration (default: `admin:admin`).
|
||||
|
||||
## Updating Grafana to v5.2.2
|
||||
|
||||
[In Grafana versions >= 5.1 the id of the grafana user has been changed](http://docs.grafana.org/installation/docker/#migration-from-a-previous-version-of-the-docker-container-to-5-1-or-later). Unfortunately this means that files created prior to 5.1 won’t have the correct permissions for later versions.
|
||||
|
||||
| Version | User | User ID |
|
||||
|:-------:|:-------:|:-------:|
|
||||
| < 5.1 | grafana | 104 |
|
||||
| \>= 5.1 | grafana | 472 |
|
||||
|
||||
There are two possible solutions to this problem.
|
||||
- Change ownership from 104 to 472
|
||||
- Start the upgraded container as user 104
|
||||
|
||||
##### Specifying a user in docker-compose.yml
|
||||
|
||||
To change ownership of the files run your grafana container as root and modify the permissions.
|
||||
|
||||
First perform a `docker-compose down` then modify your docker-compose.yml to include the `user: root` option:
|
||||
|
||||
```
|
||||
grafana:
|
||||
image: grafana/grafana:5.2.2
|
||||
container_name: grafana
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ./grafana/datasources:/etc/grafana/datasources
|
||||
- ./grafana/dashboards:/etc/grafana/dashboards
|
||||
- ./grafana/setup.sh:/setup.sh
|
||||
entrypoint: /setup.sh
|
||||
user: root
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
|
||||
- GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- 3000
|
||||
networks:
|
||||
- monitor-net
|
||||
labels:
|
||||
org.label-schema.group: "monitoring"
|
||||
```
|
||||
|
||||
Perform a `docker-compose up -d` and then issue the following commands:
|
||||
|
||||
```
|
||||
docker exec -it --user root grafana bash
|
||||
|
||||
# in the container you just started:
|
||||
chown -R root:root /etc/grafana && \
|
||||
chmod -R a+r /etc/grafana && \
|
||||
chown -R grafana:grafana /var/lib/grafana && \
|
||||
chown -R grafana:grafana /usr/share/grafana
|
||||
```
|
||||
|
||||
To run the grafana container as `user: 104`, change your `docker-compose.yml` as follows:
|
||||
|
||||
```
|
||||
grafana:
|
||||
image: grafana/grafana:5.2.2
|
||||
container_name: grafana
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ./grafana/datasources:/etc/grafana/datasources
|
||||
- ./grafana/dashboards:/etc/grafana/dashboards
|
||||
- ./grafana/setup.sh:/setup.sh
|
||||
entrypoint: /setup.sh
|
||||
user: "104"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
|
||||
- GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- 3000
|
||||
networks:
|
||||
- monitor-net
|
||||
labels:
|
||||
org.label-schema.group: "monitoring"
|
||||
```
|
|
@ -0,0 +1,11 @@
|
|||
# Alertmanager configuration: every alert is routed to a single Slack receiver.
route:
  receiver: 'slack'

receivers:
  - name: 'slack'
    slack_configs:
      - send_resolved: true
        # Message body is the alert's shared "description" annotation.
        text: "{{ .CommonAnnotations.description }}"
        username: 'Prometheus'
        channel: '#prometheus'
        # SECURITY: a Slack incoming-webhook URL is a credential and must not
        # be committed. Replace the placeholder locally (or inject it at deploy
        # time). The webhook URL that was previously committed in this file
        # should be revoked and regenerated in Slack.
        api_url: 'https://hooks.slack.com/services/<webhook-id>'
|
|
@ -0,0 +1,39 @@
|
|||
# Reverse-proxy sites, one per published port. TLS is switched off on every
# site and errors are logged to stderr. The Prometheus, Alertmanager and
# Pushgateway sites require HTTP basic auth with credentials read from the
# ADMIN_USER / ADMIN_PASSWORD environment variables.

# Prometheus
:9090 {
    tls off
    errors stderr

    basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}
    proxy / prometheus:9090 {
        transparent
    }
}

# Alertmanager
:9093 {
    tls off
    errors stderr

    basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}
    proxy / alertmanager:9093 {
        transparent
    }
}

# Pushgateway
:9091 {
    tls off
    errors stderr

    basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}
    proxy / pushgateway:9091 {
        transparent
    }
}

# Grafana: no basicauth directive on this site; websocket proxying enabled.
:3000 {
    tls off
    errors stderr

    proxy / grafana:3000 {
        transparent
        websocket
    }
}
|
|
@ -0,0 +1,3 @@
|
|||
# Grafana settings supplied as GF_* environment variables.
# NOTE(review): presumably loaded through `env_file: - config` on the grafana
# service, as the README describes - confirm against the compose file in use.

# Admin account credentials; change the password before exposing Grafana.
GF_SECURITY_ADMIN_USER=admin
GF_SECURITY_ADMIN_PASSWORD=changeme

# Disable self-service sign-up.
GF_USERS_ALLOW_SIGN_UP=false
|
|
@ -0,0 +1,36 @@
|
|||
# Exporters-only stack: runs node-exporter and cAdvisor on the host network.
version: '2.1'

services:

  # Host metrics collector. The host's /proc, /sys and / are mounted read-only
  # and the exporter is pointed at those mount points.
  nodeexporter:
    container_name: nodeexporter
    image: prom/node-exporter:v0.18.1
    network_mode: host
    restart: unless-stopped
    labels:
      org.label-schema.group: "monitoring"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # "$$" is Compose's escape for a literal "$" in the regex.
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'

  # Container metrics collector.
  cadvisor:
    container_name: cadvisor
    image: gcr.io/google-containers/cadvisor:v0.34.0
    network_mode: host
    restart: unless-stopped
    labels:
      org.label-schema.group: "monitoring"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /cgroup:/cgroup:ro
|
||||
|
||||
|
|
@ -0,0 +1,133 @@
|
|||
# Monitoring stack: Prometheus, Alertmanager, node-exporter, cAdvisor, Grafana
# and Pushgateway on a private bridge network. Only the Caddy reverse proxy
# publishes ports on the host; the other services merely `expose` theirs to
# the network.
#
# ADMIN_USER / ADMIN_PASSWORD are read from the environment (or a .env file)
# and fall back to "admin" when unset, so the stack still starts with usable
# credentials if no .env file was created. Override them for any real use.
version: '2.1'

networks:
  monitor-net:
    driver: bridge

volumes:
  prometheus_data: {}
  grafana_data: {}

services:

  # Metrics database.
  prometheus:
    image: prom/prometheus:v2.17.1
    container_name: prometheus
    volumes:
      - ./prometheus:/etc/prometheus
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      # Needed for reloading the configuration via HTTP POST to /-/reload.
      - '--web.enable-lifecycle'
    restart: unless-stopped
    expose:
      - 9090
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"

  # Alert routing and notifications; configured from ./alertmanager/config.yml.
  alertmanager:
    image: prom/alertmanager:v0.20.0
    container_name: alertmanager
    volumes:
      - ./alertmanager:/etc/alertmanager
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
    restart: unless-stopped
    expose:
      - 9093
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"

  # Host metrics collector; host /proc, /sys and / are mounted read-only.
  nodeexporter:
    image: prom/node-exporter:v0.18.1
    container_name: nodeexporter
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # "$$" is Compose's escape for a literal "$" in the regex.
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped
    expose:
      - 9100
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"

  # Container metrics collector.
  cadvisor:
    image: gcr.io/google-containers/cadvisor:v0.34.0
    container_name: cadvisor
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      #- /cgroup:/cgroup:ro #doesn't work on MacOS only for Linux
    restart: unless-stopped
    expose:
      - 8080
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"

  # Dashboards; data sources and dashboards come from ./grafana/provisioning.
  grafana:
    image: grafana/grafana:6.7.2
    container_name: grafana
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      # Fall back to admin/admin when the variables are not set.
      - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
    restart: unless-stopped
    expose:
      - 3000
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"

  # Push acceptor for ephemeral and batch jobs.
  pushgateway:
    image: prom/pushgateway:v1.2.0
    container_name: pushgateway
    restart: unless-stopped
    expose:
      - 9091
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"

  # Reverse proxy and basic-auth front end; the only service that publishes
  # ports on the host. Its site definitions live in ./caddy.
  caddy:
    # NOTE(review): image tag is not pinned, unlike the other services.
    image: stefanprodan/caddy
    container_name: caddy
    ports:
      - "3000:3000"
      - "9090:9090"
      - "9093:9093"
      - "9091:9091"
    volumes:
      - ./caddy:/etc/caddy
    environment:
      # Same fallbacks as Grafana so both always agree on the credentials.
      - ADMIN_USER=${ADMIN_USER:-admin}
      - ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
    restart: unless-stopped
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
|
|
@ -0,0 +1,12 @@
|
|||
# Grafana dashboard provisioning: one file-based provider that loads the
# dashboard files found under the path given in `options.path`.
apiVersion: 1

providers:
  - name: "Prometheus"
    type: file
    orgId: 1
    # Empty folder name, as in the original definition.
    folder: ""
    options:
      path: /etc/grafana/provisioning/dashboards
    # Provisioned dashboards may be edited, updated from the UI and deleted.
    editable: true
    allowUiUpdates: true
    disableDeletion: false
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,398 @@
|
|||
{
|
||||
"id": null,
|
||||
"title": "Nginx",
|
||||
"description": "Nginx exporter metrics",
|
||||
"tags": [
|
||||
"nginx"
|
||||
],
|
||||
"style": "dark",
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"hideControls": false,
|
||||
"sharedCrosshair": true,
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"editable": true,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "Prometheus",
|
||||
"decimals": 2,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 3,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "requests",
|
||||
"metric": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Requests/sec",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "Prometheus",
|
||||
"decimals": 2,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 2,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(nginx_connections_current) by (state)",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{state}}",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 2
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Connections",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "Prometheus",
|
||||
"decimals": 2,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 1,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "{{stage}}",
|
||||
"metric": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Connections rate",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Nginx exporter metrics"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"editable": true,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": null,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 4,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "nginx",
|
||||
"refId": "A",
|
||||
"step": 2
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU usage",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Nginx container metrics"
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 12,
|
||||
"version": 9,
|
||||
"links": [],
|
||||
"gnetId": null
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
orgId: 1
|
||||
url: http://prometheus:9090
|
||||
basicAuth: false
|
||||
isDefault: true
|
||||
editable: true
|
|
@ -0,0 +1,22 @@
|
|||
# Prometheus on EC2 & ECS:
|
||||
|
||||
Some helpers for anyone configuring Prometheus on ECS and AWS EC2.
|
||||
|
||||
To get started on AWS ECS and EC2:
|
||||
|
||||
*For EC2/ECS nodes*:
|
||||
- Import the ECS task definitions for cAdvisor and node-exporter, create a service/task for each, and run them on every host you want to monitor
|
||||
- Any host that has the "Monitoring: On" tag will automatically be added to the targets
|
||||
- Expose ports 9100 and 9191 to your Prometheus host
|
||||
|
||||
*For Prometheus host*:
|
||||
|
||||
- Copy the prometheus.yml configuration provided here into your base Prometheus configuration to enable EC2 service discovery
|
||||
- `docker compose up -d`
|
||||
|
||||
**Note**:
|
||||
Set `query.staleness-delta` to 1m to make metrics more real-time
|
||||
|
||||
|
||||
### TODO
|
||||
- Add alerting rules based on ECS
|
|
@ -0,0 +1,78 @@
|
|||
{
|
||||
"family": "cadvisor",
|
||||
"containerDefinitions": [
|
||||
{
|
||||
"name": "cadvisor",
|
||||
"image": "google/cadvisor",
|
||||
"cpu": 10,
|
||||
"memory": 300,
|
||||
"portMappings": [
|
||||
{
|
||||
"containerPort": 9191,
|
||||
"hostPort": 9191
|
||||
}
|
||||
],
|
||||
"essential": true,
|
||||
"privileged": true,
|
||||
"mountPoints": [
|
||||
{
|
||||
"sourceVolume": "root",
|
||||
"containerPath": "/rootfs",
|
||||
"readOnly": true
|
||||
},
|
||||
{
|
||||
"sourceVolume": "var_run",
|
||||
"containerPath": "/var/run",
|
||||
"readOnly": false
|
||||
},
|
||||
{
|
||||
"sourceVolume": "sys",
|
||||
"containerPath": "/sys",
|
||||
"readOnly": true
|
||||
},
|
||||
{
|
||||
"sourceVolume": "var_lib_docker",
|
||||
"containerPath": "/var/lib/docker",
|
||||
"readOnly": true
|
||||
},
|
||||
{
|
||||
"sourceVolume": "cgroup",
|
||||
"containerPath": "/cgroup",
|
||||
"readOnly": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"volumes": [
|
||||
{
|
||||
"name": "root",
|
||||
"host": {
|
||||
"sourcePath": "/"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "var_run",
|
||||
"host": {
|
||||
"sourcePath": "/var/run"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "sys",
|
||||
"host": {
|
||||
"sourcePath": "/sys"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "var_lib_docker",
|
||||
"host": {
|
||||
"sourcePath": "/var/lib/docker/"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "cgroup",
|
||||
"host": {
|
||||
"sourcePath": "/cgroup"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
{
|
||||
"family": "prometheus",
|
||||
"containerDefinitions": [
|
||||
{
|
||||
"portMappings": [
|
||||
{
|
||||
"hostPort": 9100,
|
||||
"containerPort": 9100,
|
||||
"protocol": "tcp"
|
||||
}
|
||||
],
|
||||
"essential": true,
|
||||
"name": "node_exporter",
|
||||
"image": "prom/node-exporter",
|
||||
"cpu": 0,
|
||||
"privileged": null,
|
||||
"memoryReservation": 150
|
||||
}
|
||||
],
|
||||
"volumes": [],
|
||||
"networkMode": "host"
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
# Attach these labels to any time series or alerts when communicating with
|
||||
# external systems (federation, remote storage, Alertmanager).
|
||||
external_labels:
|
||||
monitor: 'docker-host-alpha'
|
||||
|
||||
# Load the rules in these files and evaluate them every 'evaluation_interval'.
|
||||
rule_files:
|
||||
- "targets.rules"
|
||||
- "hosts.rules"
|
||||
- "containers.rules"
|
||||
|
||||
# Scrape configurations: each job below defines a set of targets to scrape.
|
||||
scrape_configs:
|
||||
- job_name: 'nodeexporter'
|
||||
scrape_interval: 5s
|
||||
static_configs:
|
||||
- targets: ['nodeexporter:9100']
|
||||
|
||||
- job_name: 'cadvisor'
|
||||
scrape_interval: 5s
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
|
||||
- job_name: 'prometheus'
|
||||
scrape_interval: 10s
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
|
||||
# sample scrape configuration for AWS EC2
|
||||
- job_name: 'nodeexporter'
|
||||
ec2_sd_configs:
|
||||
- region: us-east-1
|
||||
port: 9100
|
||||
relabel_configs:
|
||||
# Only monitor instances whose "Monitoring" tag is set to "On"
|
||||
- source_labels: [__meta_ec2_tag_Monitoring]
|
||||
regex: On
|
||||
action: keep
|
||||
|
||||
- job_name: 'cadvisor'
|
||||
ec2_sd_configs:
|
||||
- region: us-east-1
|
||||
port: 9010
|
||||
relabel_configs:
|
||||
# Only monitor instances whose "Monitoring" tag is set to "On"
|
||||
- source_labels: [__meta_ec2_tag_Monitoring]
|
||||
regex: On
|
||||
action: keep
|
|
@ -0,0 +1,70 @@
|
|||
groups:
|
||||
- name: targets
|
||||
rules:
|
||||
- alert: monitor_service_down
|
||||
expr: up == 0
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Monitor service non-operational"
|
||||
description: "Service {{ $labels.instance }} is down."
|
||||
|
||||
- name: host
|
||||
rules:
|
||||
- alert: high_cpu_load
|
||||
expr: node_load1 > 1.5
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server under high load"
|
||||
description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: high_memory_load
|
||||
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server memory is almost full"
|
||||
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: high_storage_load
|
||||
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server storage is almost full"
|
||||
description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- name: containers
|
||||
rules:
|
||||
- alert: jenkins_down
|
||||
expr: absent(container_memory_usage_bytes{name="jenkins"})
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Jenkins down"
|
||||
description: "Jenkins container is down for more than 30 seconds."
|
||||
|
||||
- alert: jenkins_high_cpu
|
||||
expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Jenkins high CPU usage"
|
||||
description: "Jenkins CPU usage is {{ humanize $value}}%."
|
||||
|
||||
- alert: jenkins_high_memory
|
||||
expr: sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Jenkins high memory usage"
|
||||
description: "Jenkins memory consumption is at {{ humanize $value}}."
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
# Attach these labels to any time series or alerts when communicating with
|
||||
# external systems (federation, remote storage, Alertmanager).
|
||||
external_labels:
|
||||
monitor: 'docker-host-alpha'
|
||||
|
||||
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
|
||||
rule_files:
|
||||
- "alert.rules"
|
||||
|
||||
# Scrape configurations: each job below defines a set of targets to scrape.
|
||||
scrape_configs:
|
||||
- job_name: 'nodeexporter'
|
||||
scrape_interval: 5s
|
||||
static_configs:
|
||||
- targets: ['nodeexporter:9100']
|
||||
|
||||
- job_name: 'cadvisor'
|
||||
scrape_interval: 5s
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
|
||||
- job_name: 'prometheus'
|
||||
scrape_interval: 10s
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
- job_name: 'pushgateway'
|
||||
scrape_interval: 10s
|
||||
honor_labels: true
|
||||
static_configs:
|
||||
- targets: ['pushgateway:9091']
|
||||
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- scheme: http
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'alertmanager:9093'
|
||||
|
||||
# - job_name: 'nginx'
|
||||
# scrape_interval: 10s
|
||||
# static_configs:
|
||||
# - targets: ['nginxexporter:9113']
|
||||
|
||||
# - job_name: 'aspnetcore'
|
||||
# scrape_interval: 10s
|
||||
# static_configs:
|
||||
# - targets: ['eventlog-proxy:5000', 'eventlog:5000']
|
Binary file not shown.
After Width: | Height: | Size: 921 KiB |
Binary file not shown.
After Width: | Height: | Size: 918 KiB |
Binary file not shown.
After Width: | Height: | Size: 513 KiB |
Binary file not shown.
After Width: | Height: | Size: 408 KiB |
Binary file not shown.
After Width: | Height: | Size: 235 KiB |
Binary file not shown.
After Width: | Height: | Size: 277 KiB |
Loading…
Reference in New Issue