Skip to content

Commit

Permalink
Various fixes
Browse files Browse the repository at this point in the history
formatting some files with goimports
formatting multiline yaml better in example config
logging fix in handler
Dockerfile will build smaller binaries
  • Loading branch information
lucasreed committed Jan 3, 2020
1 parent bfa2248 commit 3649023
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 40 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ LABEL maintainer="Luke Reed <[email protected]>"
WORKDIR /go/src/github.com/fairwindsops/astro
ADD . /go/src/github.com/fairwindsops/astro

RUN GO111MODULE=on GOOS=linux GOARCH=amd64 go build
RUN GO111MODULE=on GOOS=linux GOARCH=amd64 go build -ldflags "-s -w"


FROM gcr.io/distroless/base
Expand Down
55 changes: 22 additions & 33 deletions conf-example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,14 @@ rulesets:
name: "Deployment Replica Alert - {{ .ObjectMeta.Name }}"
type: metric alert
query: "max(last_10m):max:kubernetes_state.deployment.replicas_available{namespace:{{ .ObjectMeta.Namespace }}} by {deployment} <= 0"
message: |
message: |-
{{ "{{#is_alert}}" }}
Available replicas is currently 0 for {{ .ObjectMeta.Name }}
{{ "{{/is_alert}}" }}
{{ "{{^is_alert}}" }}
Available replicas is no longer 0 for {{ .ObjectMeta.Name }}
{{ "{{/is_alert}}" }}
tags:
- astro
tags: []
options:
no_data_timeframe: 60
notify_audit: false
Expand All @@ -44,16 +43,15 @@ rulesets:
name: "Deployment Replica Alert - {{ .ObjectMeta.Name }}"
type: metric alert
query: "max(last_10m):max:kubernetes_state.deployment.replicas_available{deployment:{{ .ObjectMeta.Name }}} <= 0"
message: |
message: |-
{{ "{{#is_alert}}" }}
Available replicas is currently 0 for {{ "{{deployment.name}}" }}
{{ "{{/is_alert}}" }}
{{ "{{^is_alert}}" }}
Available replicas is no longer 0 for {{ "{{deployment.name}}" }}
{{ "{{/is_alert}}" }}
{{ ClusterVariables.warning_notifications }}
tags:
- astro
tags: []
options:
notify_audit: false
notify_no_data: false
Expand All @@ -70,13 +68,12 @@ rulesets:
name: "High System Load Average"
type: metric alert
query: "avg(last_30m):avg:system.load.norm.5{k8s.io/role/master:1} by {host} > 2"
message: |
message: |-
Load average is high on {{ "{{host.name}} {{host.ip}}" }}.
This is a normalized load based on the number of CPUs (i.e. ActualLoadAverage / NumberOfCPUs)
      Is this node over-provisioned? Pods may need to have CPU limits closer to their requests
Is this node doing a lot of I/O? Load average could be high based on high disk or networking I/O. This may be acceptable if application performance is still ok. To reduce I/O-based system load, you may need to artificially limit the number of high-I/O pods running on a single node.
tags:
- astro
tags: []
options:
notify_audit: false
notify_no_data: false
Expand All @@ -88,7 +85,7 @@ rulesets:
name: "Memory Utilization"
type: query alert
query: "avg(last_15m):avg:system.mem.pct_usable{k8s.io/role/master:1} by {host} < 0.1"
message: |
message: |-
{{ "{{#is_alert}}" }}
Running out of free memory on {{ "{{host.name}}" }}
{{ "{{/is_alert}}" }}
Expand All @@ -98,8 +95,7 @@ rulesets:
{{ "{{#is_alert_recovery}}" }}
      Memory is below threshold again
{{ "{{/is_alert_recovery}}" }}
tags:
- astro
tags: []
options:
notify_audit: false
notify_no_data: false
Expand All @@ -113,7 +109,7 @@ rulesets:
name: "Pending Pods"
type: metric alert
query: "min(last_30m):sum:kubernetes_state.pod.status_phase{phase:running} - sum:kubernetes_state.pod.status_phase{phase:running} + sum:kubernetes_state.pod.status_phase{phase:pending}.fill(zero) >= 1"
message: |
message: |-
{{ "{{#is_alert}}" }}
There has been at least 1 pod Pending for 30 minutes.
There are currently {{ "{{value}}" }} pods Pending.
Expand All @@ -124,8 +120,7 @@ rulesets:
{{ "{{^is_alert}}" }}
Pods are no longer pending.
{{ "{{/is_alert}}" }}
tags:
- astro
tags: []
options:
notify_audit: false
notify_no_data: false
Expand All @@ -137,7 +132,7 @@ rulesets:
name: "Host Disk Usage"
type: metric alert
query: "avg(last_30m):(avg:system.disk.total{*} by {host} - avg:system.disk.free{*} by {host}) / avg:system.disk.total{*} by {host} * 100 > 90"
message: |
message: |-
{{ "{{#is_alert}}" }}
Disk Usage has been above threshold over 30 minutes on {{ "{{host.name}}" }}
{{ "{{/is_alert}}" }}
Expand All @@ -150,8 +145,7 @@ rulesets:
{{ "{{^is_warning}}" }}
Disk Usage has recovered on {{ "{{host.name}}" }}
{{ "{{/is_warning}}" }}
tags:
- astro
tags: []
options:
notify_audit: false
notify_no_data: false
Expand All @@ -167,15 +161,14 @@ rulesets:
name: "HPA Errors"
type: event alert
query: "events('sources:kubernetes priority:all \"unable to fetch metrics from resource metrics API:\"').by('hpa').rollup('count').last('1h') > 200"
message: |
message: |-
{{ "{{#is_alert}}" }}
A high number of hpa failures (> {{ "{{threshold}}" }} ) are occurring. Can HPAs get metrics?
{{ "{{/is_alert}}" }}
{{ "{{#is_alert_recovery}}" }}
HPA Metric Retrieval Failure has recovered.
{{ "{{/is_alert_recovery}}" }}
tags:
- astro
tags: []
options:
notify_audit: false
notify_no_data: false
Expand All @@ -185,7 +178,7 @@ rulesets:
name: "I/O Wait Times"
type: metric alert
query: "avg(last_10m):avg:system.cpu.iowait{*} by {host} > 50"
message: |
message: |-
{{ "{{#is_alert}}" }}
      The I/O wait time for {{ "{{host.ip}}" }} is very high
- Is the EBS volume out of burst capacity for iops?
Expand All @@ -195,8 +188,7 @@ rulesets:
{{ "{{^is_alert}}" }}
The EBS volume burst capacity is returning to normal.
{{ "{{/is_alert}}" }}
tags:
- astro
tags: []
options:
notify_audit: false
new_host_delay: 300
Expand All @@ -210,15 +202,14 @@ rulesets:
name: "Nginx Config Reload Failure"
type: metric alert
query: "max(last_5m):max:ingress.nginx_ingress_controller_config_last_reload_successful{*} by {kube_deployment} <= 0"
message: |
message: |-
{{ "{{#is_alert}}" }}
The last nginx config reload for {{ "{{kube_deployment.name}}" }} failed! Are there any bad ingress configs? Does the nginx config have a syntax error?
{{ "{{/is_alert}}" }}
{{ "{{#is_recovery}}" }}
Nginx config reloaded successfully!
{{ "{{/is_recovery}}" }}
tags:
- astro
tags: []
options:
notify_audit: false
new_host_delay: 300
Expand All @@ -233,7 +224,7 @@ rulesets:
type: service check
query: |
"kubernetes_state.node.ready".by("host").last(20).count_by_status()
message: |
message: |-
{{ "{{#is_alert}}" }}
A Node is not ready!
Cluster: {{ "{{kubernetescluster.name}}" }}
Expand All @@ -247,8 +238,7 @@ rulesets:
Host: {{ "{{host.name}}" }}
IP: {{ "{{host.ip}}" }}
{{ "{{/is_recovery}}" }}
tags:
- astro
tags: []
options:
notify_audit: false
no_data_timeframe: 2
Expand All @@ -267,15 +257,14 @@ rulesets:
name: "Increased Pod Crashes - {{ .ObjectMeta.Name }}"
type: query alert
query: "avg(last_5m):avg:kubernetes_state.container.restarts{namespace:{{ .ObjectMeta.Name }}} by {pod} - hour_before(avg:kubernetes_state.container.restarts{namespace:{{ .ObjectMeta.Name }}} by {pod}) > 3"
message: |
message: |-
{{ "{{#is_alert}}" }}
{{ "{{pod.name}}" }} has crashed repeatedly over the last hour
{{ "{{/is_alert}}" }}
{{ "{{^is_alert}}" }}
{{ "{{pod.name}}" }} appears to have stopped crashing
{{ "{{/is_alert}}" }}
tags:
- astro
tags: []
options:
notify_audit: false
notify_no_data: false
Expand Down
10 changes: 6 additions & 4 deletions pkg/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ import (
"testing"
"time"

"github.com/fairwindsops/astro/pkg/kube"
"github.com/sirupsen/logrus"
"github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/assert"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -18,6 +14,12 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"

"github.com/sirupsen/logrus"
"github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/assert"

"github.com/fairwindsops/astro/pkg/kube"
)

func TestCreateDeploymentController(t *testing.T) {
Expand Down
6 changes: 5 additions & 1 deletion pkg/datadog/test_helpers.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
package datadog

// If running mockgen on this package, you will unfortunately need to comment everything out below beforehand
// and then uncomment again afterwards

import (
"os"

mocks "github.com/fairwindsops/astro/pkg/mocks"
"github.com/golang/mock/gomock"

mocks "github.com/fairwindsops/astro/pkg/mocks"
)

// GetMock will return a mock datadog client API
Expand Down
2 changes: 1 addition & 1 deletion pkg/handler/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ import (
// obj is the Kubernetes object that was updated.
// event is the Event metadata representing the update.
func OnUpdate(obj interface{}, event config.Event) {
log.Infof("Handler got an OnUpdate event of type %s", event.EventType)
log.Infof("Handler got an OnUpdate event of type %s", event.ResourceType)

if event.EventType == "delete" {
onDelete(event)
Expand Down

0 comments on commit 3649023

Please sign in to comment.