In this example, we will implement the ATLAS statistical anomaly detector using SENTINL.
Our situation:
- We have a Varnish Cache server acting as frontend load balancer and caching proxy
- The backends are selected based on their `first_url_part`
- Backends are dynamically added or removed by our development teams (even new applications)
If we look at the 95th percentile of our consolidated backend runtimes, we cannot see problems in a single backend service. If we draw a graph for every service, there are too many series to spot a problem.
To solve this, we will implement the ATLAS algorithm.
Here is a Timelion screenshot of a load balancer problem (not reproduced here):
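As a rough idea, a Timelion expression along these lines (index, time field, and split field taken from the watchers below; the split limit of 10 is an arbitrary choice) would chart the per-service surprise values collected in the atlas index:

```
.es(index=atlas, timefield=execution_time, metric=avg:value, split=metric:10)
```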
How to do this? We need two watchers:
- The first one collects the most surprising `req_runtime` of every backend for every hour.
- The second one iterates every 5 minutes over the atlas index to find anomalies to report.
First Watcher
This watcher collects the most surprising `req_runtime` of every backend for every hour and inserts the results into the atlas index (using a `webhook` action and the `_bulk` API):
{
  "_index": "watcher",
  "_type": "watch",
  "_id": "surprise",
  "_score": 1,
  "_source": {
    "trigger": {
      "schedule": {
        "later": "every 1 hours"
      }
    },
    "input": {
      "search": {
        "request": {
          "index": "public-front-*",
          "body": {
            "query": {
              "filtered": {
                "filter": {
                  "range": {
                    "@timestamp": {
                      "gte": "now-24h"
                    }
                  }
                }
              }
            },
            "size": 0,
            "aggs": {
              "metrics": {
                "terms": {
                  "field": "first_url_part"
                },
                "aggs": {
                  "queries": {
                    "terms": {
                      "field": "backend"
                    },
                    "aggs": {
                      "series": {
                        "date_histogram": {
                          "field": "@timestamp",
                          "interval": "hour"
                        },
                        "aggs": {
                          "avg": {
                            "avg": {
                              "script": "doc['req_runtime'].value*1000",
                              "lang": "expression"
                            }
                          },
                          "movavg": {
                            "moving_avg": {
                              "buckets_path": "avg",
                              "window": 24,
                              "model": "simple"
                            }
                          },
                          "surprise": {
                            "bucket_script": {
                              "buckets_path": {
                                "avg": "avg",
                                "movavg": "movavg"
                              },
                              "script": {
                                "file": "surprise",
                                "lang": "groovy"
                              }
                            }
                          }
                        }
                      },
                      "largest_surprise": {
                        "max_bucket": {
                          "buckets_path": "series.surprise"
                        }
                      }
                    }
                  },
                  "ninetieth_surprise": {
                    "percentiles_bucket": {
                      "buckets_path": "queries>largest_surprise",
                      "percents": [
                        90.01
                      ]
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    "condition": {
      "script": {
        "script": "payload.hits.total > 1"
      }
    },
    "transform": {
      "script": {
        "script": "payload.aggregations.metrics.buckets.forEach(function(e){ e.ninetieth_surprise.value = e.ninetieth_surprise.values['90.01']; e.newts = new Date().toJSON(); })"
      }
    },
    "actions": {
      "ES_bulk_request": {
        "throttle_period": "1m",
        "webhook": {
          "method": "POST",
          "host": "myhost",
          "port": 80,
          "path": "/_bulk",
          "body": "{{#payload.aggregations.metrics.buckets}}{\"index\":{\"_index\":\"atlas\", \"_type\":\"data\"}}\n{\"metric\":\"{{key}}\", \"value\":{{ninetieth_surprise.value}}, \"execution_time\":\"{{newts}}\"}\n{{/payload.aggregations.metrics.buckets}}",
          "headers": {
            "content-type": "text/plain; charset=ISO-8859-1"
          }
        }
      }
    }
  }
}
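The `bucket_script` aggregation references a stored script file named `surprise`, whose contents are not shown in this example. A minimal Groovy sketch, assuming surprise is simply the absolute deviation of the hourly average from its 24-hour moving average (the usual definition in the ATLAS approach):

```groovy
// surprise.groovy: a sketch, not the original stored script.
// "avg" and "movavg" are bound by the buckets_path map of the
// bucket_script aggregation above.
if (avg == null || movavg == null) return null
return Math.abs(avg - movavg)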
The transform script makes the 90th-percentile surprise value of every bucket accessible to mustache and adds a current timestamp (`newts`). The action writes the relevant values back to a separate index named atlas.
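For illustration, assuming a hypothetical `first_url_part` value of `shop`, the mustache template renders one action/document pair per bucket into the `_bulk` body, roughly like this (the value and timestamp are made up):

```
{"index":{"_index":"atlas", "_type":"data"}}
{"metric":"shop", "value":2.73, "execution_time":"2016-09-22T14:00:00.000Z"}
```

Each hourly run therefore appends one such document per `first_url_part` to the atlas index.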
Second Watcher
The second watcher iterates every 5 minutes over the atlas index to find anomalies to report:
{
  "_index": "watcher",
  "_type": "watch",
  "_id": "check_surprise",
  "_score": 1,
  "_source": {
    "trigger": {
      "schedule": {
        "later": "every 5 minutes"
      }
    },
    "input": {
      "search": {
        "request": {
          "index": "atlas",
          "body": {
            "query": {
              "filtered": {
                "filter": {
                  "range": {
                    "execution_time": {
                      "gte": "now-6h"
                    }
                  }
                }
              }
            },
            "size": 0,
            "aggs": {
              "metrics": {
                "terms": {
                  "field": "metric"
                },
                "aggs": {
                  "series": {
                    "date_histogram": {
                      "field": "execution_time",
                      "interval": "hour"
                    },
                    "aggs": {
                      "avg": {
                        "avg": {
                          "field": "value"
                        }
                      }
                    }
                  },
                  "series_stats": {
                    "extended_stats": {
                      "field": "value",
                      "sigma": 3
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    "condition": {
      "script": {
        "script": "var status=false;payload.aggregations.metrics.buckets.forEach(function(e){ var std_upper=parseFloat(e.series_stats.std_deviation_bounds.upper); var avg=parseFloat(JSON.stringify(e.series.buckets.slice(-1)[0].avg.value)); if(isNaN(std_upper)||isNaN(avg)) {return status;}; if(avg > std_upper) {status=true; return status;};});status;"
      }
    },
    "transform": {
      "script": {
        "script": "var alerts=[];payload.aggregations.metrics.buckets.forEach(function(e){ var std_upper=parseFloat(e.series_stats.std_deviation_bounds.upper); var avg=parseFloat(JSON.stringify(e.series.buckets.slice(-1)[0].avg.value)); if(isNaN(std_upper)||isNaN(avg)) {return false;}; if(avg > std_upper) {alerts.push(e.key)};}); payload.alerts=alerts"
      }
    },
    "actions": {
      "series_alarm": {
        "throttle_period": "15m",
        "email": {
          "to": "alarms@email.com",
          "from": "sentinl@localhost",
          "subject": "ATLAS ALARM Varnish_first_url_part",
          "priority": "high",
          "body": "there is an alarm for the following Varnish_first_url_parts:{{#alerts}}{{.}}<br>{{/alerts}}"
        }
      }
    }
  }
}
The condition script tests whether the average value of the last bucket is greater than the upper bound of the standard deviation (`std_deviation_bounds.upper`).
The transform script does the same check, but collects the offending metric names into an alerts array at the top of the payload. At the end, we alert via email (or a REST POST, etc.).
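For readability, the per-metric test that both scripts apply can be restated as a standalone JavaScript helper (hypothetical; not part of the watcher):

```javascript
// Restatement of the per-metric anomaly test used by the condition
// and transform scripts above (hypothetical standalone helper).
function isAnomalous(bucket) {
  // three-sigma upper bound over the last 6 hours of surprise values
  var stdUpper = parseFloat(bucket.series_stats.std_deviation_bounds.upper);
  // average surprise of the most recent hourly bucket
  var last = bucket.series.buckets.slice(-1)[0];
  var avg = parseFloat(last.avg.value);
  if (isNaN(stdUpper) || isNaN(avg)) return false; // incomplete data
  return avg > stdUpper; // latest hour breaks the three-sigma band
}
```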
Credits
Thanks to Christian (@cherweg) for contributing his examples to the community.