initial commit

2020-04-09 16:45:33 -07:00
commit 3caac51ca1
28 changed files with 7517 additions and 0 deletions
--- a/.env
+++ b/.env
@@ -0,0 +1,2 @@
+ADMIN_USER=admin
+ADMIN_PASSWORD=admin
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,63 @@
+###############################################################################
+# Set default behavior to automatically normalize line endings.
+###############################################################################
+* text eol=lf
+
+###############################################################################
+# Set default behavior for command prompt diff.
+#
+# This is needed for earlier builds of msysgit that do not have it on by
+# default for csharp files.
+# Note: This is only used by command line
+###############################################################################
+#*.cs     diff=csharp
+
+###############################################################################
+# Set the merge driver for project and solution files
+#
+# Merging from the command prompt will add diff markers to the files if there
+# are conflicts (Merging from VS is not affected by the settings below, in VS
+# the diff markers are never inserted). Diff markers may cause the following 
+# file extensions to fail to load in VS. An alternative would be to treat
+# these files as binary and thus will always conflict and require user
+# intervention with every merge. To do so, just uncomment the entries below
+###############################################################################
+#*.sln       merge=binary
+#*.csproj    merge=binary
+#*.vbproj    merge=binary
+#*.vcxproj   merge=binary
+#*.vcproj    merge=binary
+#*.dbproj    merge=binary
+#*.fsproj    merge=binary
+#*.lsproj    merge=binary
+#*.wixproj   merge=binary
+#*.modelproj merge=binary
+#*.sqlproj   merge=binary
+#*.wwaproj   merge=binary
+
+###############################################################################
+# behavior for image files
+#
+# image files are treated as binary by default.
+###############################################################################
+*.jpg   binary
+*.png   binary
+*.gif   binary
+
+###############################################################################
+# diff behavior for common document formats
+# 
+# Convert binary document formats to text before diffing them. This feature
+# is only available from the command line. Turn it on by uncommenting the 
+# entries below.
+###############################################################################
+#*.doc   diff=astextplain
+#*.DOC   diff=astextplain
+#*.docx  diff=astextplain
+#*.DOCX  diff=astextplain
+#*.dot   diff=astextplain
+#*.DOT   diff=astextplain
+#*.pdf   diff=astextplain
+#*.PDF   diff=astextplain
+#*.rtf   diff=astextplain
+#*.RTF   diff=astextplain
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*DS_Store
+.vs/
+.vscode/
+.idea/
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016 Stefan Prodan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,363 @@
+Prometheus-Grafana
+========
+
+A monitoring solution for Docker hosts and containers with [Prometheus](https://prometheus.io/), [Grafana](http://grafana.org/), [cAdvisor](https://github.com/google/cadvisor),
+[NodeExporter](https://github.com/prometheus/node_exporter) and alerting with [AlertManager](https://github.com/prometheus/alertmanager).  
+
+This is a forked repository. So, you may want to visit the original repo at [stefanprodan
+/
+dockprom](https://github.com/stefanprodan/dockprom)
+
+Additional info: [Docker - Prometheus and Grafana](https://bogotobogo.com/DevOps/Docker/Docker_Prometheus_Grafana.php)
+
+## Install
+
+### Create .env:
+```
+ADMIN_USER=admin  
+ADMIN_PASSWORD=admin
+```
+
+### Clone this repository on your Docker host, cd into the cloned directory and run compose up:
+
+```
+git clone https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana.git
+cd Docker-Compose-Prometheus-and-Grafana
+docker-compose up -d
+```
+
+## Prerequisites:
+
+* Docker Engine >= 1.13
+* Docker Compose >= 1.11
+
+## Containers:
+
+* Prometheus (metrics database) `http://<host-ip>:9090`
+* Prometheus-Pushgateway (push acceptor for ephemeral and batch jobs) `http://<host-ip>:9091`
+* AlertManager (alerts management) `http://<host-ip>:9093`
+* Grafana (visualize metrics) `http://<host-ip>:3000`
+* NodeExporter (host metrics collector)
+* cAdvisor (containers metrics collector)
+* Caddy (reverse proxy and basic auth provider for prometheus and alertmanager)
+
+## Setup Grafana
+
+Navigate to `http://<host-ip>:3000` and log in with user ***admin*** and password ***admin***. You can change the credentials in the compose file or by supplying the `ADMIN_USER` and `ADMIN_PASSWORD` environment variables via the `.env` file on compose up. Alternatively, a config file can be referenced directly in the grafana service definition, like this:
+```
+grafana:
+  image: grafana/grafana:5.2.4
+  env_file:
+    - config
+
+```
+and the config file format should have this content
+```
+GF_SECURITY_ADMIN_USER=admin
+GF_SECURITY_ADMIN_PASSWORD=changeme
+GF_USERS_ALLOW_SIGN_UP=false
+```
+If you want to change the password, you have to remove this entry, otherwise the change will not take effect
+```
+- grafana_data:/var/lib/grafana
+```
+
+Grafana is preconfigured with dashboards and Prometheus as the default data source:
+
+* Name: Prometheus
+* Type: Prometheus
+* Url: http://prometheus:9090
+* Access: proxy
+
+***Docker Host Dashboard***
+
+![Host](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Docker_Host.png)
+
+The Docker Host Dashboard shows key metrics for monitoring the resource usage of your server:
+
+* Server uptime, CPU idle percent, number of CPU cores, available memory, swap and storage
+* System load average graph, running and blocked by IO processes graph, interrupts graph
+* CPU usage graph by mode (guest, idle, iowait, irq, nice, softirq, steal, system, user)
+* Memory usage graph by distribution (used, free, buffers, cached)
+* IO usage graph (read Bps, write Bps and IO time)
+* Network usage graph by device (inbound Bps, Outbound Bps)
+* Swap usage and activity graphs
+
+For the storage graphs, and the Free Storage graph in particular, you have to specify the fstype in the Grafana query.
+You can find it in `grafana/provisioning/dashboards/docker_host.json`, at line 480:
+
+      "expr": "sum(node_filesystem_free_bytes{fstype=\"btrfs\"})",
+
+I work on BTRFS, so I needed to change `aufs` to `btrfs`.
+
+You can find the right value for your system in Prometheus (`http://<host-ip>:9090`) by running this query:
+
+      node_filesystem_free_bytes
+
+***Docker Containers Dashboard***
+
+![Containers](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Docker_Containers.png)
+
+The Docker Containers Dashboard shows key metrics for monitoring running containers:
+
+* Total containers CPU load, memory and storage usage
+* Running containers graph, system load graph, IO usage graph
+* Container CPU usage graph
+* Container memory usage graph
+* Container cached memory usage graph
+* Container network inbound usage graph
+* Container network outbound usage graph
+
+Note that this dashboard doesn't show the containers that are part of the monitoring stack.
+
+***Monitor Services Dashboard***
+
+![Monitor Services](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Prometheus.png)
+![Monitor Services](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Prometheus2.png)
+![Monitor Services](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Grafana_Prometheus3.png)
+
+The Monitor Services Dashboard shows key metrics for monitoring the containers that make up the monitoring stack:
+
+* Prometheus container uptime, monitoring stack total memory usage, Prometheus local storage memory chunks and series
+* Container CPU usage graph
+* Container memory usage graph
+* Prometheus chunks to persist and persistence urgency graphs
+* Prometheus chunks ops and checkpoint duration graphs
+* Prometheus samples ingested rate, target scrapes and scrape duration graphs
+* Prometheus HTTP requests graph
+* Prometheus alerts graph
+
+## Define alerts
+
+Three alert groups have been setup within the [alert.rules](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules) configuration file:
+
+* Monitoring services alerts [targets](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules#L2-L11)
+* Docker Host alerts [host](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules#L13-L40)
+* Docker Containers alerts [containers](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/prometheus/alert.rules#L42-L69)
+
+You can modify the alert rules and reload them by making a HTTP POST call to Prometheus:
+
+```
+curl -X POST http://admin:admin@<host-ip>:9090/-/reload
+```
+
+***Monitoring services alerts***
+
+Trigger an alert if any of the monitoring targets (node-exporter and cAdvisor) are down for more than 30 seconds:
+
+```yaml
+- alert: monitor_service_down
+  expr: up == 0
+  for: 30s
+  labels:
+    severity: critical
+  annotations:
+    summary: "Monitor service non-operational"
+    description: "Service {{ $labels.instance }} is down."
+```
+
+***Docker Host alerts***
+
+Trigger an alert if the Docker host CPU is under high load for more than 30 seconds:
+
+```yaml
+- alert: high_cpu_load
+  expr: node_load1 > 1.5
+  for: 30s
+  labels:
+    severity: warning
+  annotations:
+    summary: "Server under high load"
+    description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+```
+
+Modify the load threshold based on your CPU cores.
+
+Trigger an alert if the Docker host memory is almost full:
+
+```yaml
+- alert: high_memory_load
+  expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
+  for: 30s
+  labels:
+    severity: warning
+  annotations:
+    summary: "Server memory is almost full"
+    description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+```
+
+Trigger an alert if the Docker host storage is almost full:
+
+```yaml
+- alert: high_storage_load
+  expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"}  * 100 > 85
+  for: 30s
+  labels:
+    severity: warning
+  annotations:
+    summary: "Server storage is almost full"
+    description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+```
+
+***Docker Containers alerts***
+
+Trigger an alert if a container is down for more than 30 seconds:
+
+```yaml
+- alert: jenkins_down
+  expr: absent(container_memory_usage_bytes{name="jenkins"})
+  for: 30s
+  labels:
+    severity: critical
+  annotations:
+    summary: "Jenkins down"
+    description: "Jenkins container is down for more than 30 seconds."
+```
+
+Trigger an alert if a container is using more than 10% of total CPU cores for more than 30 seconds:
+
+```yaml
+- alert: jenkins_high_cpu
+  expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
+  for: 30s
+  labels:
+    severity: warning
+  annotations:
+    summary: "Jenkins high CPU usage"
+    description: "Jenkins CPU usage is {{ humanize $value}}%."
+```
+
+Trigger an alert if a container is using more than 1.2GB of RAM for more than 30 seconds:
+
+```yaml
+- alert: jenkins_high_memory
+  expr: sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000
+  for: 30s
+  labels:
+    severity: warning
+  annotations:
+    summary: "Jenkins high memory usage"
+    description: "Jenkins memory consumption is at {{ humanize $value}}."
+```
+
+## Setup alerting
+
+The AlertManager service is responsible for handling alerts sent by Prometheus server.
+AlertManager can send notifications via email, Pushover, Slack, HipChat or any other system that exposes a webhook interface.
+A complete list of integrations can be found [here](https://prometheus.io/docs/alerting/configuration).
+
+You can view and silence notifications by accessing `http://<host-ip>:9093`.
+
+The notification receivers can be configured in [alertmanager/config.yml](https://github.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/blob/master/alertmanager/config.yml) file.
+
+To receive alerts via Slack you need to make a custom integration by choosing ***incoming web hooks*** in your Slack team app page.
+You can find more details on setting up Slack integration [here](http://www.robustperception.io/using-slack-with-the-alertmanager/).
+
+Copy the Slack Webhook URL into the ***api_url*** field and specify a Slack ***channel***.
+
+```yaml
+route:
+    receiver: 'slack'
+
+receivers:
+    - name: 'slack'
+      slack_configs:
+          - send_resolved: true
+            text: "{{ .CommonAnnotations.description }}"
+            username: 'Prometheus'
+            channel: '#<channel>'
+            api_url: 'https://hooks.slack.com/services/<webhook-id>'
+```
+
+![Slack Notifications](https://raw.githubusercontent.com/Einsteinish/Docker-Compose-Prometheus-and-Grafana/master/screens/Slack_Notifications.png)
+
+## Sending metrics to the Pushgateway
+
+The [pushgateway](https://github.com/prometheus/pushgateway) is used to collect data from batch jobs or from services.
+
+To push data, simply execute:
+
+    echo "some_metric 3.14" | curl --data-binary @- http://user:password@localhost:9091/metrics/job/some_job
+
+Please replace the `user:password` part with your user and password set in the initial configuration (default: `admin:admin`).
+
+## Updating Grafana to v5.2.2
+
+[In Grafana versions >= 5.1 the id of the grafana user has been changed](http://docs.grafana.org/installation/docker/#migration-from-a-previous-version-of-the-docker-container-to-5-1-or-later). Unfortunately this means that files created prior to 5.1 won’t have the correct permissions for later versions.
+
+| Version |   User  | User ID |
+|:-------:|:-------:|:-------:|
+|  < 5.1  | grafana |   104   |
+|  \>= 5.1 | grafana |   472   |
+
+There are two possible solutions to this problem.
+- Change ownership from 104 to 472
+- Start the upgraded container as user 104
+
+##### Specifying a user in docker-compose.yml
+
+To change ownership of the files run your grafana container as root and modify the permissions.
+
+First perform a `docker-compose down` then modify your docker-compose.yml to include the `user: root` option:
+
+```
+  grafana:
+    image: grafana/grafana:5.2.2
+    container_name: grafana
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - ./grafana/datasources:/etc/grafana/datasources
+      - ./grafana/dashboards:/etc/grafana/dashboards
+      - ./grafana/setup.sh:/setup.sh
+    entrypoint: /setup.sh
+    user: root
+    environment:
+      - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
+      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
+      - GF_USERS_ALLOW_SIGN_UP=false
+    restart: unless-stopped
+    expose:
+      - 3000
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+```
+
+Perform a `docker-compose up -d` and then issue the following commands:
+
+```
+docker exec -it --user root grafana bash
+
+# in the container you just started:
+chown -R root:root /etc/grafana && \
+chmod -R a+r /etc/grafana && \
+chown -R grafana:grafana /var/lib/grafana && \
+chown -R grafana:grafana /usr/share/grafana
+```
+
+To run the grafana container as `user: 104` change your `docker-compose.yml` like such:
+
+```
+  grafana:
+    image: grafana/grafana:5.2.2
+    container_name: grafana
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - ./grafana/datasources:/etc/grafana/datasources
+      - ./grafana/dashboards:/etc/grafana/dashboards
+      - ./grafana/setup.sh:/setup.sh
+    entrypoint: /setup.sh
+    user: "104"
+    environment:
+      - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
+      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
+      - GF_USERS_ALLOW_SIGN_UP=false
+    restart: unless-stopped
+    expose:
+      - 3000
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+```
--- a/alertmanager/config.yml
+++ b/alertmanager/config.yml
@@ -0,0 +1,11 @@
+route:  # top-level route: every alert goes to the single receiver named below
+    receiver: 'slack'
+
+receivers:
+    - name: 'slack'
+      slack_configs:
+          - send_resolved: true  # also post a message when an alert resolves
+            text: "{{ .CommonAnnotations.description }}"  # message body comes from the alerts' common `description` annotation
+            username: 'Prometheus'
+            channel: '#prometheus'
+            api_url: 'https://hooks.slack.com/services/<webhook-id>'  # placeholder; a real webhook URL is a credential, so set it locally and never commit it
--- a/caddy/Caddyfile
+++ b/caddy/Caddyfile
@@ -0,0 +1,39 @@
+:9090 {
+    basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}  # Prometheus front door: require the credentials passed in through the container environment
+    proxy / prometheus:9090 {
+            transparent  # NOTE(review): expected to pass the original Host/client headers to the backend; confirm for the Caddy version in use
+        }
+
+    errors stderr  # send error output to stderr
+    tls off  # plain HTTP on this port (no TLS)
+}
+
+:9093 {
+    basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}  # Alertmanager front door: require the credentials passed in through the container environment
+    proxy / alertmanager:9093 {
+            transparent  # NOTE(review): expected to pass the original Host/client headers to the backend; confirm for the Caddy version in use
+        }
+
+    errors stderr  # send error output to stderr
+    tls off  # plain HTTP on this port (no TLS)
+}
+
+:9091 {
+    basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}  # Pushgateway front door: require the credentials passed in through the container environment
+    proxy / pushgateway:9091 {
+            transparent  # NOTE(review): expected to pass the original Host/client headers to the backend; confirm for the Caddy version in use
+        }
+
+    errors stderr  # send error output to stderr
+    tls off  # plain HTTP on this port (no TLS)
+}
+
+:3000 {
+    proxy / grafana:3000 {
+            transparent  # NOTE(review): expected to pass the original Host/client headers to the backend; confirm for the Caddy version in use
+            websocket  # allow websocket connections through to Grafana
+        }
+
+    errors stderr  # send error output to stderr
+    tls off  # plain HTTP on this port (no TLS); unlike the other sites there is no basicauth line here
+}
--- a/3
+++ b/3
@@ -0,0 +1,3 @@
+GF_SECURITY_ADMIN_USER=admin
+GF_SECURITY_ADMIN_PASSWORD=changeme
+GF_USERS_ALLOW_SIGN_UP=false
--- a/docker-compose.exporters.yml
+++ b/docker-compose.exporters.yml
@@ -0,0 +1,36 @@
+version: '2.1'
+
+services:
+
+  nodeexporter:  # host metrics collector, run directly on the host network
+    image: prom/node-exporter:v0.18.1
+    container_name: nodeexporter
+    volumes:  # host filesystems mounted read-only so the exporter can read them
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:  # point the exporter at the mounted host paths above
+      - '--path.procfs=/host/proc'
+      - '--path.rootfs=/rootfs'
+      - '--path.sysfs=/host/sys'
+      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'  # `$$` is Compose's escape for a literal `$` in the regex
+    restart: unless-stopped
+    network_mode: host  # no port mapping: the container shares the host's network stack
+    labels:
+      org.label-schema.group: "monitoring"
+
+  cadvisor:  # container metrics collector, run directly on the host network
+    image: gcr.io/google-containers/cadvisor:v0.34.0
+    container_name: cadvisor
+    volumes:  # host paths cAdvisor inspects; only /var/run is writable
+      - /:/rootfs:ro
+      - /var/run:/var/run:rw
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /cgroup:/cgroup:ro  # NOTE(review): docker-compose.yml comments this mount out as Linux-only; confirm /cgroup exists on the target hosts
+    restart: unless-stopped
+    network_mode: host  # no port mapping: the container shares the host's network stack
+    labels:
+      org.label-schema.group: "monitoring"
+  
+
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,133 @@
+version: '2.1'  # Compose file format used by this stack
+
+networks:  # single bridge network shared by every service below
+  monitor-net:
+    driver: bridge
+
+volumes:  # named volumes so Prometheus and Grafana data survive container re-creation
+  prometheus_data: {}
+  grafana_data: {}
+
+services:
+
+  prometheus:  # metrics database; reachable from outside only through the caddy proxy
+    image: prom/prometheus:v2.17.1
+    container_name: prometheus
+    volumes:
+      - ./prometheus:/etc/prometheus  # configuration directory from the repository
+      - prometheus_data:/prometheus  # named volume holding the TSDB (see --storage.tsdb.path)
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--storage.tsdb.retention.time=200h'  # keep samples for 200 hours
+      - '--web.enable-lifecycle'  # lifecycle endpoints on; the README reloads rules via POST /-/reload
+    restart: unless-stopped
+    expose:  # visible to other containers on monitor-net, not published on the host
+      - 9090
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  alertmanager:  # alert handling; reachable from outside only through the caddy proxy
+    image: prom/alertmanager:v0.20.0
+    container_name: alertmanager
+    volumes:
+      - ./alertmanager:/etc/alertmanager  # provides config.yml referenced below
+    command:
+      - '--config.file=/etc/alertmanager/config.yml'
+      - '--storage.path=/alertmanager'  # NOTE(review): no named volume is mounted here; confirm whether this state needs to persist
+    restart: unless-stopped
+    expose:  # visible to other containers on monitor-net, not published on the host
+      - 9093
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  nodeexporter:  # host metrics collector, attached to the monitoring network
+    image: prom/node-exporter:v0.18.1
+    container_name: nodeexporter
+    volumes:  # host filesystems mounted read-only so the exporter can read them
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:  # point the exporter at the mounted host paths above
+      - '--path.procfs=/host/proc'
+      - '--path.rootfs=/rootfs'
+      - '--path.sysfs=/host/sys'
+      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'  # `$$` is Compose's escape for a literal `$` in the regex
+    restart: unless-stopped
+    expose:  # visible to other containers on monitor-net, not published on the host
+      - 9100
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  cadvisor:  # container metrics collector, attached to the monitoring network
+    image: gcr.io/google-containers/cadvisor:v0.34.0
+    container_name: cadvisor
+    volumes:  # host paths cAdvisor inspects; only /var/run is writable
+      - /:/rootfs:ro
+      - /var/run:/var/run:rw
+      - /sys:/sys:ro
+      - /var/lib/docker:/var/lib/docker:ro
+      #- /cgroup:/cgroup:ro #doesn't work on MacOS only for Linux
+    restart: unless-stopped
+    expose:  # visible to other containers on monitor-net, not published on the host
+      - 8080
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  grafana:  # dashboards UI; reached from outside through the caddy proxy on port 3000
+    image: grafana/grafana:6.7.2
+    container_name: grafana
+    volumes:
+      - grafana_data:/var/lib/grafana  # named volume mounted at /var/lib/grafana
+      - ./grafana/provisioning:/etc/grafana/provisioning  # provisioned datasources and dashboards from the repository
+    environment:  # credentials come from .env; fall back to admin/admin (the documented default) when unset or empty
+      - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
+      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
+      - GF_USERS_ALLOW_SIGN_UP=false
+    restart: unless-stopped
+    expose:  # visible to other containers on monitor-net, not published on the host
+      - 3000
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  pushgateway:  # push endpoint for batch-job metrics; reachable from outside only through the caddy proxy
+    image: prom/pushgateway:v1.2.0
+    container_name: pushgateway
+    restart: unless-stopped
+    expose:  # visible to other containers on monitor-net, not published on the host
+      - 9091
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  caddy:  # the only service in this file that publishes host ports; proxies to the others per ./caddy/Caddyfile
+    image: stefanprodan/caddy  # NOTE(review): no tag, so this resolves to `latest`; consider pinning a version
+    container_name: caddy
+    ports:
+      - "3000:3000"  # grafana
+      - "9090:9090"  # prometheus
+      - "9093:9093"  # alertmanager
+      - "9091:9091"  # pushgateway
+    volumes:
+      - ./caddy:/etc/caddy
+    environment:  # consumed by the Caddyfile's basicauth directives; default to admin/admin when unset or empty
+      - ADMIN_USER=${ADMIN_USER:-admin}
+      - ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
+    restart: unless-stopped
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
--- a/grafana/provisioning/dashboards/dashboard.yml
+++ b/grafana/provisioning/dashboards/dashboard.yml
@@ -0,0 +1,12 @@
+apiVersion: 1
+
+providers:  # dashboard providers read from Grafana's provisioning directory
+  - name: 'Prometheus'
+    orgId: 1
+    folder: ''  # empty folder name; presumably places the dashboards at the top level (confirm)
+    type: file  # dashboards are loaded from files on disk
+    disableDeletion: false
+    editable: true
+    allowUiUpdates: true
+    options:
+      path: /etc/grafana/provisioning/dashboards  # container path; docker-compose.yml mounts ./grafana/provisioning at /etc/grafana/provisioning
--- a/grafana/provisioning/dashboards/docker_containers.json
+++ b/grafana/provisioning/dashboards/docker_containers.json
--- a/grafana/provisioning/dashboards/docker_host.json
+++ b/grafana/provisioning/dashboards/docker_host.json
--- a/grafana/provisioning/dashboards/monitor_services.json
+++ b/grafana/provisioning/dashboards/monitor_services.json
--- a/grafana/provisioning/dashboards/nginx_container.json
+++ b/grafana/provisioning/dashboards/nginx_container.json
@@ -0,0 +1,398 @@
+{
+  "id": null,
+  "title": "Nginx",
+  "description": "Nginx exporter metrics",
+  "tags": [
+    "nginx"
+  ],
+  "style": "dark",
+  "timezone": "browser",
+  "editable": true,
+  "hideControls": false,
+  "sharedCrosshair": true,
+  "rows": [
+    {
+      "collapse": false,
+      "editable": true,
+      "height": "250px",
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "Prometheus",
+          "decimals": 2,
+          "editable": true,
+          "error": false,
+          "fill": 1,
+          "grid": {
+            "threshold1": null,
+            "threshold1Color": "rgba(216, 200, 27, 0.27)",
+            "threshold2": null,
+            "threshold2Color": "rgba(234, 112, 112, 0.22)"
+          },
+          "id": 3,
+          "isNew": true,
+          "legend": {
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": true,
+            "show": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 2,
+          "links": [],
+          "nullPointMode": "connected",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 12,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)",
+              "hide": false,
+              "interval": "",
+              "intervalFactor": 10,
+              "legendFormat": "requests",
+              "metric": "",
+              "refId": "B",
+              "step": 10
+            }
+          ],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Requests/sec",
+          "tooltip": {
+            "msResolution": false,
+            "shared": true,
+            "sort": 0,
+            "value_type": "cumulative"
+          },
+          "type": "graph",
+          "xaxis": {
+            "show": true
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": 0,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "Prometheus",
+          "decimals": 2,
+          "editable": true,
+          "error": false,
+          "fill": 1,
+          "grid": {
+            "threshold1": null,
+            "threshold1Color": "rgba(216, 200, 27, 0.27)",
+            "threshold2": null,
+            "threshold2Color": "rgba(234, 112, 112, 0.22)"
+          },
+          "id": 2,
+          "isNew": true,
+          "legend": {
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": true,
+            "show": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 2,
+          "links": [],
+          "nullPointMode": "connected",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 12,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(nginx_connections_current) by (state)",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "{{state}}",
+              "metric": "",
+              "refId": "A",
+              "step": 2
+            }
+          ],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Connections",
+          "tooltip": {
+            "msResolution": false,
+            "shared": true,
+            "sort": 0,
+            "value_type": "cumulative"
+          },
+          "type": "graph",
+          "xaxis": {
+            "show": true
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": 0,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "Prometheus",
+          "decimals": 2,
+          "editable": true,
+          "error": false,
+          "fill": 1,
+          "grid": {
+            "threshold1": null,
+            "threshold1Color": "rgba(216, 200, 27, 0.27)",
+            "threshold2": null,
+            "threshold2Color": "rgba(234, 112, 112, 0.22)"
+          },
+          "id": 1,
+          "isNew": true,
+          "legend": {
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": true,
+            "show": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 2,
+          "links": [],
+          "nullPointMode": "connected",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 12,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)",
+              "hide": false,
+              "interval": "",
+              "intervalFactor": 10,
+              "legendFormat": "{{stage}}",
+              "metric": "",
+              "refId": "B",
+              "step": 10
+            }
+          ],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Connections rate",
+          "tooltip": {
+            "msResolution": false,
+            "shared": true,
+            "sort": 0,
+            "value_type": "cumulative"
+          },
+          "type": "graph",
+          "xaxis": {
+            "show": true
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": 0,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "title": "Nginx exporter metrics"
+    },
+    {
+      "collapse": false,
+      "editable": true,
+      "height": "250px",
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": null,
+          "editable": true,
+          "error": false,
+          "fill": 1,
+          "grid": {
+            "threshold1": null,
+            "threshold1Color": "rgba(216, 200, 27, 0.27)",
+            "threshold2": null,
+            "threshold2Color": "rgba(234, 112, 112, 0.22)"
+          },
+          "id": 4,
+          "isNew": true,
+          "legend": {
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": true,
+            "show": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 2,
+          "links": [],
+          "nullPointMode": "connected",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 12,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
+              "intervalFactor": 2,
+              "legendFormat": "nginx",
+              "refId": "A",
+              "step": 2
+            }
+          ],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "CPU usage",
+          "tooltip": {
+            "msResolution": false,
+            "shared": true,
+            "sort": 0,
+            "value_type": "cumulative"
+          },
+          "type": "graph",
+          "xaxis": {
+            "show": true
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
+      "title": "Nginx container metrics"
+    }
+  ],
+  "time": {
+    "from": "now-15m",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+    "templating": {
+    "list": []
+  },
+  "annotations": {
+    "list": []
+  },
+  "refresh": "10s",
+  "schemaVersion": 12,
+  "version": 9,
+  "links": [],
+  "gnetId": null
+}
--- a/grafana/provisioning/datasources/datasource.yml
+++ b/grafana/provisioning/datasources/datasource.yml
@@ -0,0 +1,11 @@
+apiVersion: 1
+# Data sources declared by this Grafana provisioning file.
+datasources:
+  - name: 'Prometheus'
+    type: 'prometheus'
+    orgId: 1
+    url: 'http://prometheus:9090'
+    access: 'proxy'
+    isDefault: true
+    editable: true
+    basicAuth: false
--- a/helpers/aws/README.md
+++ b/helpers/aws/README.md
@@ -0,0 +1,22 @@
+# Prometheus on EC2 & ECS:
+
+Some helpers for anyone configuring Prometheus on ECS and AWS EC2.
+
+To get started on AWS ECS and EC2:
+
+*For EC2/ECS nodes*:
+- Import the ECS task definitions for cAdvisor and node-exporter, create a service for each, and run them on every host you want to monitor
+- Any host tagged `Monitoring: On` is automatically added to the scrape targets
+- Expose ports 9100 (node-exporter) and 9191 (cAdvisor) to your Prometheus host
+
+*For Prometheus host*:
+
+- Copy the prometheus.yml provided here into the base Prometheus configuration to enable EC2 service discovery
+- `docker compose up -d`
+
+**Note**:
+Set `query.staleness-delta` to 1m to make metrics more real-time
+
+
+### TODO
+- Add alerting rules based on ECS
--- a/helpers/aws/cadvisor_ecs_task_definition.json
+++ b/helpers/aws/cadvisor_ecs_task_definition.json
@@ -0,0 +1,78 @@
+{
+    "family": "cadvisor",
+    "containerDefinitions": [
+        {
+            "name": "cadvisor",
+            "image": "google/cadvisor",
+            "cpu": 10,
+            "memory": 300,
+            "portMappings": [
+                {
+                    "containerPort": 8080,
+                    "hostPort": 9191
+                }
+            ],
+            "essential": true,
+            "privileged": true,
+            "mountPoints": [
+                {
+                    "sourceVolume": "root",
+                    "containerPath": "/rootfs",
+                    "readOnly": true
+                },
+                {
+                    "sourceVolume": "var_run",
+                    "containerPath": "/var/run",
+                    "readOnly": false
+                },
+                {
+                    "sourceVolume": "sys",
+                    "containerPath": "/sys",
+                    "readOnly": true
+                },
+                {
+                    "sourceVolume": "var_lib_docker",
+                    "containerPath": "/var/lib/docker",
+                    "readOnly": true
+                },
+                {
+                    "sourceVolume": "cgroup",
+                    "containerPath": "/cgroup",
+                    "readOnly": true
+                }
+            ]
+        }
+    ],
+    "volumes": [
+        {
+            "name": "root",
+            "host": {
+                "sourcePath": "/"
+            }
+        },
+        {
+            "name": "var_run",
+            "host": {
+                "sourcePath": "/var/run"
+            }
+        },
+        {
+            "name": "sys",
+            "host": {
+                "sourcePath": "/sys"
+            }
+        },
+        {
+            "name": "var_lib_docker",
+            "host": {
+                "sourcePath": "/var/lib/docker/"
+            }
+        },
+        {
+            "name": "cgroup",
+            "host": {
+                "sourcePath": "/cgroup"
+            }
+        }
+    ]
+}
--- a/helpers/aws/node_exporter_task_definition.json
+++ b/helpers/aws/node_exporter_task_definition.json
@@ -0,0 +1,22 @@
+{
+    "family": "prometheus",
+    "networkMode": "host",
+    "containerDefinitions": [
+        {
+            "name": "node_exporter",
+            "image": "prom/node-exporter",
+            "cpu": 0,
+            "memoryReservation": 150,
+            "essential": true,
+            "privileged": null,
+            "portMappings": [
+                {
+                    "containerPort": 9100,
+                    "hostPort": 9100,
+                    "protocol": "tcp"
+                }
+            ]
+        }
+    ],
+    "volumes": []
+}
--- a/helpers/aws/prometheus.yml
+++ b/helpers/aws/prometheus.yml
@@ -0,0 +1,53 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+  # Labels added to every time series and alert sent to external systems
+  # (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: docker-host-alpha
+
+# Load the rules in these files and evaluate them every 'evaluation_interval'.
+# The repository ships one rules file, alert.rules, which holds the targets,
+# host and containers groups.
+rule_files:
+  - "alert.rules"
+
+# Scrape jobs: statically configured targets plus EC2-discovered hosts.
+scrape_configs:
+  - job_name: 'nodeexporter'
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['nodeexporter:9100']
+
+  - job_name: 'cadvisor'
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['cadvisor:8080']
+
+  - job_name: 'prometheus'
+    scrape_interval: 10s
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # Sample scrape configuration for AWS EC2. Job names must be unique within
+  # scrape_configs, so the discovered jobs carry an 'ec2-' prefix.
+  - job_name: 'ec2-nodeexporter'
+    ec2_sd_configs:
+      - region: us-east-1
+        port: 9100
+    relabel_configs:
+      # Keep only instances whose EC2 tag "Monitoring" has the value "On".
+      - source_labels: [__meta_ec2_tag_Monitoring]
+        regex: 'On'
+        action: keep
+
+  - job_name: 'ec2-cadvisor'
+    ec2_sd_configs:
+      - region: us-east-1
+        port: 9191  # host port published by the cAdvisor ECS task definition
+    relabel_configs:
+      # Keep only instances whose EC2 tag "Monitoring" has the value "On".
+      - source_labels: [__meta_ec2_tag_Monitoring]
+        regex: 'On'
+        action: keep
--- a/prometheus/alert.rules
+++ b/prometheus/alert.rules
@@ -0,0 +1,70 @@
+groups:
+- name: targets
+  rules:
+  - alert: monitor_service_down
+    expr: up == 0  # matches every target whose `up` series is currently 0
+    for: 30s
+    labels:
+      severity: critical
+    annotations:
+      summary: 'Monitor service non-operational'
+      description: 'Service {{ $labels.instance }} is down.'
+
+- name: host
+  rules:
+  - alert: high_cpu_load
+    expr: node_load1 > 1.5
+    for: 30s
+    labels:
+      severity: warning
+    annotations:
+      summary: "Server under high load"
+      description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+  # Per-instance ratio (no sum()) so $labels.instance and $labels.job stay set.
+  - alert: high_memory_load
+    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 85
+    for: 30s
+    labels:
+      severity: warning
+    annotations:
+      summary: "Server memory is almost full"
+      description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+  # NOTE(review): selects fstype="aufs" only; other filesystems never alert.
+  - alert: high_storage_load
+    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"}  * 100 > 85
+    for: 30s
+    labels:
+      severity: warning
+    annotations:
+      summary: "Server storage is almost full"
+      description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+- name: containers
+  rules:
+  - alert: jenkins_down
+    expr: absent(container_memory_usage_bytes{name="jenkins"})  # no series for the container
+    for: 30s
+    labels:
+      severity: critical
+    annotations:
+      summary: 'Jenkins down'
+      description: 'Jenkins container is down for more than 30 seconds.'
+
+  - alert: jenkins_high_cpu
+    expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
+    for: 30s
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Jenkins high CPU usage'
+      description: 'Jenkins CPU usage is {{ humanize $value}}%.'
+
+  - alert: jenkins_high_memory
+    expr: sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000  # threshold: 1.2e9 bytes
+    for: 30s
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Jenkins high memory usage'
+      description: 'Jenkins memory consumption is at {{ humanize $value}}.'
+
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -0,0 +1,53 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+  # Labels added to every time series and alert sent to external systems
+  # (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: docker-host-alpha
+
+# Rule files whose groups are evaluated every 'evaluation_interval' seconds.
+rule_files:
+  - 'alert.rules'
+
+# Scrape jobs, one per monitored service.
+scrape_configs:
+  - job_name: 'nodeexporter'
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['nodeexporter:9100']
+
+  - job_name: 'cadvisor'
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['cadvisor:8080']
+
+  - job_name: 'prometheus'
+    scrape_interval: 10s
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'pushgateway'
+    scrape_interval: 10s
+    honor_labels: true  # on label conflicts, keep the labels from the scraped data
+    static_configs:
+      - targets: ['pushgateway:9091']
+
+  # Optional jobs: uncomment in place so they stay inside scrape_configs.
+#  - job_name: 'nginx'
+#    scrape_interval: 10s
+#    static_configs:
+#      - targets: ['nginxexporter:9113']
+
+#  - job_name: 'aspnetcore'
+#    scrape_interval: 10s
+#    static_configs:
+#      - targets: ['eventlog-proxy:5000', 'eventlog:5000']
+
+alerting:
+  alertmanagers:
+    - scheme: http
+      static_configs:
+        - targets:
+            - 'alertmanager:9093'
--- a/screens/Grafana_Docker_Containers.png
+++ b/screens/Grafana_Docker_Containers.png
--- a/screens/Grafana_Docker_Host.png
+++ b/screens/Grafana_Docker_Host.png
--- a/screens/Grafana_Prometheus.png
+++ b/screens/Grafana_Prometheus.png
--- a/screens/Grafana_Prometheus2.png
+++ b/screens/Grafana_Prometheus2.png
--- a/screens/Grafana_Prometheus3.png
+++ b/screens/Grafana_Prometheus3.png
--- a/screens/Slack_Notifications.png
+++ b/screens/Slack_Notifications.png