initial commit

2020-04-09 16:45:33 -07:00
commit 3caac51ca1
28 changed files with 7517 additions and 0 deletions
--- a/.env
+++ b/.env
@@ -0,0 +1,2 @@
 # Credentials substituted into docker-compose.yml as ${ADMIN_USER} / ${ADMIN_PASSWORD}:
 # they become the Grafana admin account and the basic-auth login used in the Caddyfile.
 # NOTE(review): these are default credentials committed to VCS; change them before exposing the stack.
 ADMIN_USER=admin
 ADMIN_PASSWORD=admin
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,63 @@
 ###############################################################################
 # Set default behavior to automatically normalize line endings.
 ###############################################################################
 * text eol=lf
 ###############################################################################
 # Set default behavior for command prompt diff.
 #
 # This is needed for earlier builds of msysgit that do not have it on by
 # default for csharp files.
 # Note: This is only used by command line
 ###############################################################################
 #*.cs     diff=csharp
 ###############################################################################
 # Set the merge driver for project and solution files
 #
 # Merging from the command prompt will add diff markers to the files if there
 # are conflicts (Merging from VS is not affected by the settings below, in VS
 # the diff markers are never inserted). Diff markers may cause the following 
 # file extensions to fail to load in VS. An alternative would be to treat
 # these files as binary and thus will always conflict and require user
 # intervention with every merge. To do so, just uncomment the entries below
 ###############################################################################
 #*.sln       merge=binary
 #*.csproj    merge=binary
 #*.vbproj    merge=binary
 #*.vcxproj   merge=binary
 #*.vcproj    merge=binary
 #*.dbproj    merge=binary
 #*.fsproj    merge=binary
 #*.lsproj    merge=binary
 #*.wixproj   merge=binary
 #*.modelproj merge=binary
 #*.sqlproj   merge=binary
 #*.wwaproj   merge=binary
 ###############################################################################
 # behavior for image files
 #
 # image files are treated as binary by default.
 ###############################################################################
 *.jpg   binary
 *.png   binary
 *.gif   binary
 ###############################################################################
 # diff behavior for common document formats
 # 
 # Convert binary document formats to text before diffing them. This feature
 # is only available from the command line. Turn it on by uncommenting the 
 # entries below.
 ###############################################################################
 #*.doc   diff=astextplain
 #*.DOC   diff=astextplain
 #*.docx  diff=astextplain
 #*.DOCX  diff=astextplain
 #*.dot   diff=astextplain
 #*.DOT   diff=astextplain
 #*.pdf   diff=astextplain
 #*.PDF   diff=astextplain
 #*.rtf   diff=astextplain
 #*.RTF   diff=astextplain
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,4 @@
 # macOS Finder metadata files
 *DS_Store
 # Editor / IDE workspace directories (Visual Studio, VS Code, JetBrains IDEs)
 .vs/
 .vscode/
 .idea/
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2016 Stefan Prodan
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,363 @@
 Prometheus-Grafana
 ========
 A monitoring solution for Docker hosts and containers with [Prometheus](https://prometheus.io/), [Grafana](http://grafana.org/), [cAdvisor](https://github.com/google/cadvisor),
 [NodeExporter](https://github.com/prometheus/node_exporter) and alerting with [AlertManager](https://github.com/prometheus/alertmanager).  
 This is a forked repository, so you may want to visit the original repo at [stefanprodan/dockprom](https://github.com/stefanprodan/dockprom).
 Additional info: [Docker - Prometheus and Grafana](https://bogotobogo.com/DevOps/Docker/Docker_Prometheus_Grafana.php)
 ## Install
 ### Create .env:
 ```
 ADMIN_USER=admin  
 ADMIN_PASSWORD=admin
 ```
 ### Clone this repository on your Docker host, cd into the cloned directory and run compose up:
 ```
 git clone https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana.git
 cd Docker-Compose-Prometheus-and-Grafana
 docker-compose up -d
 ```
 ## Prerequisites:
 * Docker Engine >= 1.13
 * Docker Compose >= 1.11
 ## Containers:
 * Prometheus (metrics database) `http://<host-ip>:9090`
 * Prometheus-Pushgateway (push acceptor for ephemeral and batch jobs) `http://<host-ip>:9091`
 * AlertManager (alerts management) `http://<host-ip>:9093`
 * Grafana (visualize metrics) `http://<host-ip>:3000`
 * NodeExporter (host metrics collector)
 * cAdvisor (containers metrics collector)
 * Caddy (reverse proxy and basic auth provider for prometheus and alertmanager)
 ## Setup Grafana
 Navigate to `http://<host-ip>:3000` and log in with user ***admin*** and password ***admin***. You can change the credentials in the compose file or by supplying the `ADMIN_USER` and `ADMIN_PASSWORD` environment variables via the `.env` file on compose up. Alternatively, a config file can be referenced directly in the grafana service definition, like this:
 ```
 grafana:
  image: grafana/grafana:5.2.4
  env_file:
    - config
 ```
 and the config file format should have this content
 ```
 GF_SECURITY_ADMIN_USER=admin
 GF_SECURITY_ADMIN_PASSWORD=changeme
 GF_USERS_ALLOW_SIGN_UP=false
 ```
 If you want to change the password, you have to remove this entry, otherwise the change will not take effect
 ```
 - grafana_data:/var/lib/grafana
 ```
 Grafana is preconfigured with dashboards and Prometheus as the default data source:
 * Name: Prometheus
 * Type: Prometheus
 * Url: http://prometheus:9090
 * Access: proxy
 ***Docker Host Dashboard***
 ![Host](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Docker_Host.png)
 The Docker Host Dashboard shows key metrics for monitoring the resource usage of your server:
 * Server uptime, CPU idle percent, number of CPU cores, available memory, swap and storage
 * System load average graph, running and blocked by IO processes graph, interrupts graph
 * CPU usage graph by mode (guest, idle, iowait, irq, nice, softirq, steal, system, user)
 * Memory usage graph by distribution (used, free, buffers, cached)
 * IO usage graph (read Bps, write Bps and IO time)
 * Network usage graph by device (inbound Bps, Outbound Bps)
 * Swap usage and activity graphs
 For storage, and particularly the Free Storage graph, you have to specify the fstype in the Grafana graph request.
 You can find it in `grafana/dashboards/docker_host.json`, at line 480 :
      "expr": "sum(node_filesystem_free_bytes{fstype=\"btrfs\"})",
 I work on BTRFS, so I need to change `aufs` to `btrfs`.
 You can find the right value for your system in Prometheus at `http://<host-ip>:9090` by running this query:
      node_filesystem_free_bytes
 ***Docker Containers Dashboard***
 ![Containers](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Docker_Containers.png)
 The Docker Containers Dashboard shows key metrics for monitoring running containers:
 * Total containers CPU load, memory and storage usage
 * Running containers graph, system load graph, IO usage graph
 * Container CPU usage graph
 * Container memory usage graph
 * Container cached memory usage graph
 * Container network inbound usage graph
 * Container network outbound usage graph
 Note that this dashboard doesn't show the containers that are part of the monitoring stack.
 ***Monitor Services Dashboard***
 ![Monitor Services](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Prometheus.png)
 ![Monitor Services](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Prometheus2.png)
 ![Monitor Services](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Prometheus3.png)
 The Monitor Services Dashboard shows key metrics for monitoring the containers that make up the monitoring stack:
 * Prometheus container uptime, monitoring stack total memory usage, Prometheus local storage memory chunks and series
 * Container CPU usage graph
 * Container memory usage graph
 * Prometheus chunks to persist and persistence urgency graphs
 * Prometheus chunks ops and checkpoint duration graphs
 * Prometheus samples ingested rate, target scrapes and scrape duration graphs
 * Prometheus HTTP requests graph
 * Prometheus alerts graph
 ## Define alerts
 Three alert groups have been setup within the [alert.rules](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules) configuration file:
 * Monitoring services alerts [targets](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules#L2-L11)
 * Docker Host alerts [host](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules#L13-L40)
 * Docker Containers alerts [containers](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules#L42-L69)
 You can modify the alert rules and reload them by making a HTTP POST call to Prometheus:
 ```
 curl -X POST http://admin:admin@<host-ip>:9090/-/reload
 ```
 ***Monitoring services alerts***
 Trigger an alert if any of the monitoring targets (node-exporter and cAdvisor) are down for more than 30 seconds:
 ```yaml
 - alert: monitor_service_down
    expr: up == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Monitor service non-operational"
      description: "Service {{ $labels.instance }} is down."
 ```
 ***Docker Host alerts***
 Trigger an alert if the Docker host CPU is under high load for more than 30 seconds:
 ```yaml
 - alert: high_cpu_load
    expr: node_load1 > 1.5
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server under high load"
      description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
 ```
 Modify the load threshold based on your CPU cores.
 Trigger an alert if the Docker host memory is almost full:
 ```yaml
 - alert: high_memory_load
    expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server memory is almost full"
      description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
 ```
 Trigger an alert if the Docker host storage is almost full:
 ```yaml
 - alert: high_storage_load
    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"}  * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server storage is almost full"
      description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
 ```
 ***Docker Containers alerts***
 Trigger an alert if a container is down for more than 30 seconds:
 ```yaml
 - alert: jenkins_down
    expr: absent(container_memory_usage_bytes{name="jenkins"})
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Jenkins down"
      description: "Jenkins container is down for more than 30 seconds."
 ```
 Trigger an alert if a container is using more than 10% of total CPU cores for more than 30 seconds:
 ```yaml
 - alert: jenkins_high_cpu
    expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Jenkins high CPU usage"
      description: "Jenkins CPU usage is {{ humanize $value}}%."
 ```
 Trigger an alert if a container is using more than 1.2GB of RAM for more than 30 seconds:
 ```yaml
 - alert: jenkins_high_memory
    expr: sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Jenkins high memory usage"
      description: "Jenkins memory consumption is at {{ humanize $value}}."
 ```
 ## Setup alerting
 The AlertManager service is responsible for handling alerts sent by Prometheus server.
 AlertManager can send notifications via email, Pushover, Slack, HipChat or any other system that exposes a webhook interface.
 A complete list of integrations can be found [here](https://prometheus.io/docs/alerting/configuration).
 You can view and silence notifications by accessing `http://<host-ip>:9093`.
 The notification receivers can be configured in [alertmanager/config.yml](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/alertmanager/config.yml) file.
 To receive alerts via Slack you need to make a custom integration by choosing ***incoming web hooks*** in your Slack team app page.
 You can find more details on setting up Slack integration [here](http://www.robustperception.io/using-slack-with-the-alertmanager/).
 Copy the Slack Webhook URL into the ***api_url*** field and specify a Slack ***channel***.
 ```yaml
 route:
    receiver: 'slack'
 receivers:
    - name: 'slack'
      slack_configs:
          - send_resolved: true
            text: "{{ .CommonAnnotations.description }}"
            username: 'Prometheus'
            channel: '#<channel>'
            api_url: 'https://hooks.slack.com/services/<webhook-id>'
 ```
 ![Slack Notifications](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Slack_Notifications.png)
 ## Sending metrics to the Pushgateway
 The [pushgateway](https://github.com/prometheus/pushgateway) is used to collect data from batch jobs or from services.
 To push data, simply execute:
    echo "some_metric 3.14" | curl --data-binary @- http://user:password@localhost:9091/metrics/job/some_job
 Please replace the `user:password` part with your user and password set in the initial configuration (default: `admin:admin`).
 ## Updating Grafana to v5.2.2
 [In Grafana versions >= 5.1 the id of the grafana user has been changed](http://docs.grafana.org/installation/docker/#migration-from-a-previous-version-of-the-docker-container-to-5-1-or-later). Unfortunately this means that files created prior to 5.1 won’t have the correct permissions for later versions.
 | Version |   User  | User ID |
 |:-------:|:-------:|:-------:|
 |  < 5.1  | grafana |   104   |
 |  \>= 5.1 | grafana |   472   |
 There are two possible solutions to this problem.
 - Change ownership from 104 to 472
 - Start the upgraded container as user 104
 ##### Specifying a user in docker-compose.yml
 To change ownership of the files run your grafana container as root and modify the permissions.
 First perform a `docker-compose down` then modify your docker-compose.yml to include the `user: root` option:
 ```
  grafana:
    image: grafana/grafana:5.2.2
    container_name: grafana
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/datasources:/etc/grafana/datasources
      - ./grafana/dashboards:/etc/grafana/dashboards
      - ./grafana/setup.sh:/setup.sh
    entrypoint: /setup.sh
    user: root
    environment:
      - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
    restart: unless-stopped
    expose:
      - 3000
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
 ```
 Perform a `docker-compose up -d` and then issue the following commands:
 ```
 docker exec -it --user root grafana bash
 # in the container you just started:
 chown -R root:root /etc/grafana && \
 chmod -R a+r /etc/grafana && \
 chown -R grafana:grafana /var/lib/grafana && \
 chown -R grafana:grafana /usr/share/grafana
 ```
 To run the grafana container as `user: 104`, change your `docker-compose.yml` as follows:
 ```
  grafana:
    image: grafana/grafana:5.2.2
    container_name: grafana
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/datasources:/etc/grafana/datasources
      - ./grafana/dashboards:/etc/grafana/dashboards
      - ./grafana/setup.sh:/setup.sh
    entrypoint: /setup.sh
    user: "104"
    environment:
      - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
    restart: unless-stopped
    expose:
      - 3000
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
 ```
--- a/alertmanager/config.yml
+++ b/alertmanager/config.yml
@@ -0,0 +1,11 @@
 # Alertmanager configuration: every alert is routed to the single 'slack' receiver.
 route:
     receiver: 'slack'
 receivers:
     - name: 'slack'
       slack_configs:
           # send_resolved: also post a message when an alert clears.
           - send_resolved: true
             # Message body is the alert rules' shared `description` annotation.
             text: "{{ .CommonAnnotations.description }}"
             username: 'Prometheus'
             channel: '#prometheus'
             # SECURITY: never commit a real Slack webhook URL. Paste your own
             # incoming-webhook URL here locally. A token that was ever pushed
             # to the repository must be treated as leaked and revoked in Slack.
             api_url: 'https://hooks.slack.com/services/<webhook-id>'
--- a/caddy/Caddyfile
+++ b/caddy/Caddyfile
@@ -0,0 +1,39 @@
 # Reverse proxy in front of the monitoring services, one site block per port.
 # NOTE(review): directives (proxy, tls off, errors) look like Caddy v1 syntax; confirm against the image in use.
 # {$ADMIN_USER} / {$ADMIN_PASSWORD} are placeholders filled from the container's environment.
 # Prometheus, protected by HTTP basic auth.
 :9090 {
    basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}
    proxy / prometheus:9090 {
            transparent
        }
    errors stderr
    tls off
 }
 # Alertmanager, protected by HTTP basic auth.
 :9093 {
    basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}
    proxy / alertmanager:9093 {
            transparent
        }
    errors stderr
    tls off
 }
 # Pushgateway, protected by HTTP basic auth.
 :9091 {
    basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}
    proxy / pushgateway:9091 {
            transparent
        }
    errors stderr
    tls off
 }
 # Grafana: proxied without basicauth, with the websocket preset enabled.
 # NOTE(review): presumably unauthenticated here because Grafana has its own login; confirm.
 :3000 {
    proxy / grafana:3000 {
            transparent
            websocket
        }
    errors stderr
    tls off
 }
--- a/3
+++ b/3
@@ -0,0 +1,3 @@
 # Grafana settings in env-file form. The README shows a file with this content
 # being loaded through an `env_file:` entry on the grafana service.
 # NOTE(review): docker-compose.yml in this commit does not reference this file; confirm it is still needed.
 # NOTE(review): `changeme` is a placeholder password; replace it before use.
 GF_SECURITY_ADMIN_USER=admin
 GF_SECURITY_ADMIN_PASSWORD=changeme
 GF_USERS_ALLOW_SIGN_UP=false
--- a/docker-compose.exporters.yml
+++ b/docker-compose.exporters.yml
@@ -0,0 +1,36 @@
 # Exporters-only stack: runs just node-exporter and cAdvisor, both on the
 # host's network stack (network_mode: host), with no Prometheus or Grafana.
 version: '2.1'
 services:
  # Host metrics collector.
  nodeexporter:
    image: prom/node-exporter:v0.18.1
    container_name: nodeexporter
    # Host /proc, /sys and / are bind-mounted read-only; the --path.* flags
    # below point the exporter at those mount locations.
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # `$$` is Compose's escape for a literal `$` inside the regex.
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped
    network_mode: host
    labels:
      org.label-schema.group: "monitoring"
  # Container metrics collector.
  cadvisor:
    image: gcr.io/google-containers/cadvisor:v0.34.0
    container_name: cadvisor
    # Everything is mounted read-only except /var/run, which is read-write.
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      # NOTE(review): docker-compose.yml comments this mount out as not working on macOS; confirm the target hosts are Linux.
      - /cgroup:/cgroup:ro
    restart: unless-stopped
    network_mode: host
    labels:
      org.label-schema.group: "monitoring"
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,133 @@
 # Full monitoring stack: Prometheus, Alertmanager, node-exporter, cAdvisor,
 # Grafana and Pushgateway, fronted by a Caddy reverse proxy.
 version: '2.1'
 # Bridge network that every service in this file is attached to.
 networks:
  monitor-net:
    driver: bridge
 # Named volumes mounted by the prometheus and grafana services.
 volumes:
    prometheus_data: {}
    grafana_data: {}
 services:
  # Prometheus server. It only `expose`s 9090 on monitor-net; the host-facing
  # port 9090 is published by the caddy service, which proxies to it.
  prometheus:
    image: prom/prometheus:v2.17.1
    container_name: prometheus
    volumes:
      # Configuration and rule files from the repository.
      - ./prometheus:/etc/prometheus
      # TSDB data directory (see --storage.tsdb.path below).
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Retention window for stored samples: 200 hours.
      - '--storage.tsdb.retention.time=200h'
      # Needed for the `POST /-/reload` call described in the README.
      - '--web.enable-lifecycle'
    restart: unless-stopped
    expose:
      - 9090
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
  # Alertmanager. Reached from the host through the caddy proxy on port 9093.
  alertmanager:
    image: prom/alertmanager:v0.20.0
    container_name: alertmanager
    volumes:
      # Provides /etc/alertmanager/config.yml referenced by --config.file.
      - ./alertmanager:/etc/alertmanager
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      # NOTE(review): no volume is mounted at /alertmanager, so this state is presumably not persisted across container re-creation; confirm.
      - '--storage.path=/alertmanager'
    restart: unless-stopped
    expose:
      - 9093
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
  # Host metrics collector, exposed only inside monitor-net on port 9100.
  nodeexporter:
    image: prom/node-exporter:v0.18.1
    container_name: nodeexporter
    # Host /proc, /sys and / are bind-mounted read-only; the --path.* flags
    # below point the exporter at those mount locations.
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # `$$` is Compose's escape for a literal `$` inside the regex.
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped
    expose:
      - 9100
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
  # Container metrics collector, exposed only inside monitor-net on port 8080.
  cadvisor:
    image: gcr.io/google-containers/cadvisor:v0.34.0
    container_name: cadvisor
    # Everything is mounted read-only except /var/run, which is read-write.
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      # The cgroup mount is disabled here; per the original note it works on Linux hosts only, not macOS.
      #- /cgroup:/cgroup:ro #doesn't work on MacOS only for Linux
    restart: unless-stopped
    expose:
      - 8080
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
  # Grafana UI. Reached from the host through the caddy proxy on port 3000.
  grafana:
    image: grafana/grafana:6.7.2
    container_name: grafana
    volumes:
      # Grafana's data directory (the README notes that the admin password
      # change does not take effect while this volume entry is kept).
      - grafana_data:/var/lib/grafana
      # Datasource and dashboard provisioning files from the repository.
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      # Fall back to admin/admin when ADMIN_USER / ADMIN_PASSWORD are not
      # supplied, as the README documents. Without a default, Compose
      # substitutes an empty string for an unset variable.
      - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
    restart: unless-stopped
    expose:
      - 3000
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
  # Pushgateway (push acceptor for ephemeral and batch jobs, per the README).
  # Reached from the host through the caddy proxy on port 9091.
  pushgateway:
    image: prom/pushgateway:v1.2.0
    container_name: pushgateway
    restart: unless-stopped
    expose:
      - 9091
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
  # Reverse proxy and basic-auth provider. This is the only service that
  # publishes ports on the host; it forwards them to the services above
  # according to ./caddy/Caddyfile.
  caddy:
    # NOTE(review): image has no tag, so it resolves to `latest`; consider pinning a version.
    image: stefanprodan/caddy
    container_name: caddy
    ports:
      - "3000:3000"
      - "9090:9090"
      - "9093:9093"
      - "9091:9091"
    volumes:
      - ./caddy:/etc/caddy
    environment:
      # Consumed by the Caddyfile as {$ADMIN_USER} / {$ADMIN_PASSWORD}.
      # Default to admin/admin (as the README documents) so that unset
      # variables do not leave the basicauth directives with empty values.
      - ADMIN_USER=${ADMIN_USER:-admin}
      - ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
    restart: unless-stopped
    networks:
      - monitor-net
    labels:
      org.label-schema.group: "monitoring"
--- a/grafana/provisioning/dashboards/dashboard.yml
+++ b/grafana/provisioning/dashboards/dashboard.yml
@@ -0,0 +1,12 @@
 # Grafana dashboard provisioning: a single file-based provider that loads the
 # dashboards stored alongside this file.
 apiVersion: 1
 providers:
  - name: 'Prometheus'
    orgId: 1
    # Empty folder name.
    folder: ''
    type: file
    disableDeletion: false
    editable: true
    allowUiUpdates: true
    options:
      # In-container path; docker-compose.yml mounts ./grafana/provisioning
      # at /etc/grafana/provisioning.
      path: /etc/grafana/provisioning/dashboards
--- a/grafana/provisioning/dashboards/docker_containers.json
+++ b/grafana/provisioning/dashboards/docker_containers.json
--- a/grafana/provisioning/dashboards/docker_host.json
+++ b/grafana/provisioning/dashboards/docker_host.json
--- a/grafana/provisioning/dashboards/monitor_services.json
+++ b/grafana/provisioning/dashboards/monitor_services.json
--- a/grafana/provisioning/dashboards/nginx_container.json
+++ b/grafana/provisioning/dashboards/nginx_container.json
@@ -0,0 +1,398 @@
 {
  "id": null,
  "title": "Nginx",
  "description": "Nginx exporter metrics",
  "tags": [
    "nginx"
  ],
  "style": "dark",
  "timezone": "browser",
  "editable": true,
  "hideControls": false,
  "sharedCrosshair": true,
  "rows": [
    {
      "collapse": false,
      "editable": true,
      "height": "250px",
      "panels": [
        {
          "aliasColors": {},
          "bars": false,
          "datasource": "Prometheus",
          "decimals": 2,
          "editable": true,
          "error": false,
          "fill": 1,
          "grid": {
            "threshold1": null,
            "threshold1Color": "rgba(216, 200, 27, 0.27)",
            "threshold2": null,
            "threshold2Color": "rgba(234, 112, 112, 0.22)"
          },
          "id": 3,
          "isNew": true,
          "legend": {
            "alignAsTable": true,
            "avg": true,
            "current": true,
            "max": true,
            "min": true,
            "rightSide": true,
            "show": true,
            "total": false,
            "values": true
          },
          "lines": true,
          "linewidth": 2,
          "links": [],
          "nullPointMode": "connected",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
          "span": 12,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)",
              "hide": false,
              "interval": "",
              "intervalFactor": 10,
              "legendFormat": "requests",
              "metric": "",
              "refId": "B",
              "step": 10
            }
          ],
          "timeFrom": null,
          "timeShift": null,
          "title": "Requests/sec",
          "tooltip": {
            "msResolution": false,
            "shared": true,
            "sort": 0,
            "value_type": "cumulative"
          },
          "type": "graph",
          "xaxis": {
            "show": true
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": 0,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        },
        {
          "aliasColors": {},
          "bars": false,
          "datasource": "Prometheus",
          "decimals": 2,
          "editable": true,
          "error": false,
          "fill": 1,
          "grid": {
            "threshold1": null,
            "threshold1Color": "rgba(216, 200, 27, 0.27)",
            "threshold2": null,
            "threshold2Color": "rgba(234, 112, 112, 0.22)"
          },
          "id": 2,
          "isNew": true,
          "legend": {
            "alignAsTable": true,
            "avg": true,
            "current": true,
            "max": true,
            "min": true,
            "rightSide": true,
            "show": true,
            "total": false,
            "values": true
          },
          "lines": true,
          "linewidth": 2,
          "links": [],
          "nullPointMode": "connected",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
          "span": 12,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(nginx_connections_current) by (state)",
              "interval": "",
              "intervalFactor": 2,
              "legendFormat": "{{state}}",
              "metric": "",
              "refId": "A",
              "step": 2
            }
          ],
          "timeFrom": null,
          "timeShift": null,
          "title": "Connections",
          "tooltip": {
            "msResolution": false,
            "shared": true,
            "sort": 0,
            "value_type": "cumulative"
          },
          "type": "graph",
          "xaxis": {
            "show": true
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": 0,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        },
        {
          "aliasColors": {},
          "bars": false,
          "datasource": "Prometheus",
          "decimals": 2,
          "editable": true,
          "error": false,
          "fill": 1,
          "grid": {
            "threshold1": null,
            "threshold1Color": "rgba(216, 200, 27, 0.27)",
            "threshold2": null,
            "threshold2Color": "rgba(234, 112, 112, 0.22)"
          },
          "id": 1,
          "isNew": true,
          "legend": {
            "alignAsTable": true,
            "avg": true,
            "current": true,
            "max": true,
            "min": true,
            "rightSide": true,
            "show": true,
            "total": false,
            "values": true
          },
          "lines": true,
          "linewidth": 2,
          "links": [],
          "nullPointMode": "connected",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
          "span": 12,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)",
              "hide": false,
              "interval": "",
              "intervalFactor": 10,
              "legendFormat": "{{stage}}",
              "metric": "",
              "refId": "B",
              "step": 10
            }
          ],
          "timeFrom": null,
          "timeShift": null,
          "title": "Connections rate",
          "tooltip": {
            "msResolution": false,
            "shared": true,
            "sort": 0,
            "value_type": "cumulative"
          },
          "type": "graph",
          "xaxis": {
            "show": true
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": 0,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        }
      ],
      "title": "Nginx exporter metrics"
    },
    {
      "collapse": false,
      "editable": true,
      "height": "250px",
      "panels": [
        {
          "aliasColors": {},
          "bars": false,
          "datasource": null,
          "editable": true,
          "error": false,
          "fill": 1,
          "grid": {
            "threshold1": null,
            "threshold1Color": "rgba(216, 200, 27, 0.27)",
            "threshold2": null,
            "threshold2Color": "rgba(234, 112, 112, 0.22)"
          },
          "id": 4,
          "isNew": true,
          "legend": {
            "alignAsTable": true,
            "avg": true,
            "current": true,
            "max": true,
            "min": true,
            "rightSide": true,
            "show": true,
            "total": false,
            "values": true
          },
          "lines": true,
          "linewidth": 2,
          "links": [],
          "nullPointMode": "connected",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
          "span": 12,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
              "intervalFactor": 2,
              "legendFormat": "nginx",
              "refId": "A",
              "step": 2
            }
          ],
          "timeFrom": null,
          "timeShift": null,
          "title": "CPU usage",
          "tooltip": {
            "msResolution": false,
            "shared": true,
            "sort": 0,
            "value_type": "cumulative"
          },
          "type": "graph",
          "xaxis": {
            "show": true
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        }
      ],
      "title": "Nginx container metrics"
    }
  ],
  "time": {
    "from": "now-15m",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
    "templating": {
    "list": []
  },
  "annotations": {
    "list": []
  },
  "refresh": "10s",
  "schemaVersion": 12,
  "version": 9,
  "links": [],
  "gnetId": null
 }
--- a/grafana/provisioning/datasources/datasource.yml
+++ b/grafana/provisioning/datasources/datasource.yml
@@ -0,0 +1,11 @@
 # Grafana datasource provisioning: declares a single datasource named
 # "Prometheus" for organisation 1.
 apiVersion: 1
 datasources:
  - name: Prometheus
    type: prometheus
    # NOTE(review): "proxy" presumably routes queries through the Grafana
    # backend instead of the browser - confirm against the Grafana docs.
    access: proxy
    orgId: 1
    # Port 9090 is the same port Prometheus scrapes itself on; the host name
    # "prometheus" is presumably the compose service name - confirm.
    url: http://prometheus:9090
    basicAuth: false
    # Flag this datasource as the default one.
    isDefault: true
    editable: true
--- a/helpers/aws/README.md
+++ b/helpers/aws/README.md
@@ -0,0 +1,22 @@
 # Prometheus on EC2 & ECS:
 Some helpers for anyone configuring Prometheus on ECS and AWS EC2.
 To get started on AWS ECS and EC2:
 *For EC2/ECS nodes*:
 - Import the ECS task definitions for cAdvisor and node-exporter, create a service/task for each, and run them on every host you want to monitor
 - Any host that has the "Monitoring: On" tag will be automatically added to the targets
 - Expose ports 9100 and 9191 to your Prometheus host
 *For Prometheus host*:
 - Copy the prometheus.yml configuration present here into the base Prometheus configuration to enable EC2 service discovery
 - `docker compose up -d`
 **Note**:
 Set query.staleness-delta to 1m to make metrics more real-time
 ### TODO
 - Add alerting rules based on ECS
--- a/helpers/aws/cadvisor_ecs_task_definition.json
+++ b/helpers/aws/cadvisor_ecs_task_definition.json
@@ -0,0 +1,78 @@
 {
    "family": "cadvisor",
    "containerDefinitions": [
        {
            "name": "cadvisor",
            "image": "google/cadvisor",
            "cpu": 10,
            "memory": 300,
            "portMappings": [
                {
                    "containerPort": 9191,
                    "hostPort": 9191
                }
            ],
            "essential": true,
            "privileged": true,
            "mountPoints": [
                {
                    "sourceVolume": "root",
                    "containerPath": "/rootfs",
                    "readOnly": true
                },
                {
                    "sourceVolume": "var_run",
                    "containerPath": "/var/run",
                    "readOnly": false
                },
                {
                    "sourceVolume": "sys",
                    "containerPath": "/sys",
                    "readOnly": true
                },
                {
                    "sourceVolume": "var_lib_docker",
                    "containerPath": "/var/lib/docker",
                    "readOnly": true
                },
                {
                    "sourceVolume": "cgroup",
                    "containerPath": "/cgroup",
                    "readOnly": true
                }
            ]
        }
    ],
    "volumes": [
        {
            "name": "root",
            "host": {
                "sourcePath": "/"
            }
        },
        {
            "name": "var_run",
            "host": {
                "sourcePath": "/var/run"
            }
        },
        {
            "name": "sys",
            "host": {
                "sourcePath": "/sys"
            }
        },
        {
            "name": "var_lib_docker",
            "host": {
                "sourcePath": "/var/lib/docker/"
            }
        },
        {
            "name": "cgroup",
            "host": {
                "sourcePath": "/cgroup"
            }
        }
    ]
 }
--- a/helpers/aws/node_exporter_task_definition.json
+++ b/helpers/aws/node_exporter_task_definition.json
@@ -0,0 +1,22 @@
 {
    "family": "prometheus",
    "containerDefinitions": [
        {
            "portMappings": [
                {
                    "hostPort": 9100,
                    "containerPort": 9100,
                    "protocol": "tcp"
                }
            ],
            "essential": true,
            "name": "node_exporter",
            "image": "prom/node-exporter",
            "cpu": 0,
            "privileged": null,
            "memoryReservation": 150
        }
    ],
    "volumes": [],
    "networkMode": "host"
 }
--- a/helpers/aws/prometheus.yml
+++ b/helpers/aws/prometheus.yml
@@ -0,0 +1,53 @@
 # Prometheus configuration sample for AWS: the static jobs of the base setup
 # plus EC2 service-discovery jobs for node-exporter and cAdvisor.
 global:
  scrape_interval:     15s
  evaluation_interval: 15s
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
      monitor: 'docker-host-alpha'
 # Load and evaluate rules in these files every 'evaluation_interval' seconds.
 # NOTE(review): make sure these three rule files exist next to the
 # configuration this sample is merged into.
 rule_files:
  - "targets.rules"
  - "hosts.rules"
  - "containers.rules"
 # Scrape jobs. Every job_name must be unique within the file, so the EC2
 # discovery jobs below carry an "-ec2" suffix instead of reusing the names of
 # the static jobs.
 scrape_configs:
  - job_name: 'nodeexporter'
    scrape_interval: 5s
    static_configs:
      - targets: ['nodeexporter:9100']
  - job_name: 'cadvisor'
    scrape_interval: 5s
    static_configs:
      - targets: ['cadvisor:8080']
  - job_name: 'prometheus'
    scrape_interval: 10s
    static_configs:
      - targets: ['localhost:9090']
 # Sample scrape configuration for AWS EC2
  - job_name: 'nodeexporter-ec2'
    ec2_sd_configs:
      - region: us-east-1
        port: 9100
    relabel_configs:
        # Only monitor instances whose "Monitoring" tag is set to "On".
      - source_labels: [__meta_ec2_tag_Monitoring]
        # Quoted so that no YAML parser can read the value as a boolean.
        regex: 'On'
        action: keep
  - job_name: 'cadvisor-ec2'
    ec2_sd_configs:
      - region: us-east-1
        # Host port published by the cAdvisor ECS task definition.
        port: 9191
    relabel_configs:
        # Only monitor instances whose "Monitoring" tag is set to "On".
      - source_labels: [__meta_ec2_tag_Monitoring]
        # Quoted so that no YAML parser can read the value as a boolean.
        regex: 'On'
        action: keep
--- a/prometheus/alert.rules
+++ b/prometheus/alert.rules
@@ -0,0 +1,70 @@
 groups:
 # Rule group "targets": availability of the scraped endpoints themselves.
 - name: targets
  rules:
  # Fires, with critical severity, once a target's "up" series has been 0
  # for 30 seconds; the description names the affected instance.
  - alert: monitor_service_down
    expr: up == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Monitor service non-operational"
      description: "Service {{ $labels.instance }} is down."
 # Rule group "host": load, memory and storage warnings built on node_* metrics.
 - name: host
  rules:
  # 1-minute load average above 1.5 for 30 seconds.
  - alert: high_cpu_load
    expr: node_load1 > 1.5
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server under high load"
      description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
  # Used memory (total minus free, buffers and cache) above 85 % of total.
  # The sums are grouped "by (instance, job)" so that these two labels survive
  # the aggregation; a bare sum() strips every label and would leave the
  # instance/job placeholders in the description empty.
  - alert: high_memory_load
    expr: (sum by (instance, job) (node_memory_MemTotal_bytes) - sum by (instance, job) (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / sum by (instance, job) (node_memory_MemTotal_bytes) * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server memory is almost full"
      description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
  # Used space above 85 % of the filesystem size.
  # NOTE(review): the selector only matches filesystems reported with
  # fstype="aufs"; hosts using a different filesystem type never trigger
  # this alert - confirm this is intended.
  - alert: high_storage_load
    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"}  * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server storage is almost full"
      description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
 # Rule group "containers": sample alerts for a container named "jenkins".
 - name: containers
  rules:
  # Critical when no container_memory_usage_bytes series with name="jenkins"
  # has existed for 30 seconds.
  - alert: jenkins_down
    expr: absent(container_memory_usage_bytes{name="jenkins"})
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Jenkins down"
      description: "Jenkins container is down for more than 30 seconds."
  # Warning when the container's CPU usage rate over 1 minute, divided by the
  # number of node_cpu_seconds_total{mode="system"} series (presumably one per
  # CPU core - confirm) and expressed as a percentage, exceeds 10.
  - alert: jenkins_high_cpu
    expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Jenkins high CPU usage"
      description: "Jenkins CPU usage is {{ humanize $value}}%."
  # Warning when the container's summed memory usage exceeds 1200000000 bytes
  # (1.2 GB).
  - alert: jenkins_high_memory
    expr: sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Jenkins high memory usage"
      description: "Jenkins memory consumption is at {{ humanize $value}}."
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -0,0 +1,53 @@
 # Defaults shared by every scrape job unless a job overrides them.
 global:
  scrape_interval: 15s
  evaluation_interval: 15s
  # Labels added to any time series or alert sent to external systems
  # (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'docker-host-alpha'
 # Rule files, evaluated every 'evaluation_interval'.
 rule_files:
  - "alert.rules"
 # Endpoints to scrape, one job per exporter.
 scrape_configs:
  - job_name: 'nodeexporter'
    scrape_interval: 5s
    static_configs:
      - targets:
          - 'nodeexporter:9100'
  - job_name: 'cadvisor'
    scrape_interval: 5s
    static_configs:
      - targets:
          - 'cadvisor:8080'
  - job_name: 'prometheus'
    scrape_interval: 10s
    static_configs:
      - targets:
          - 'localhost:9090'
  - job_name: 'pushgateway'
    scrape_interval: 10s
    # Keep the labels already present on the scraped metrics.
    honor_labels: true
    static_configs:
      - targets:
          - 'pushgateway:9091'
  # Disabled sample jobs; uncomment them here, under scrape_configs, to enable.
  # - job_name: 'nginx'
  #   scrape_interval: 10s
  #   static_configs:
  #     - targets: ['nginxexporter:9113']
  # - job_name: 'aspnetcore'
  #   scrape_interval: 10s
  #   static_configs:
  #     - targets: ['eventlog-proxy:5000', 'eventlog:5000']
 # Alertmanager instance that receives the alerts.
 alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - 'alertmanager:9093'
--- a/screens/Grafana_Docker_Containers.png
+++ b/screens/Grafana_Docker_Containers.png
--- a/screens/Grafana_Docker_Host.png
+++ b/screens/Grafana_Docker_Host.png
--- a/screens/Grafana_Prometheus.png
+++ b/screens/Grafana_Prometheus.png
--- a/screens/Grafana_Prometheus2.png
+++ b/screens/Grafana_Prometheus2.png
--- a/screens/Grafana_Prometheus3.png
+++ b/screens/Grafana_Prometheus3.png
--- a/screens/Slack_Notifications.png
+++ b/screens/Slack_Notifications.png