Skip to content

Commit

Permalink
Adding instrumentation around schedule lock to diagnose bottlenecks.
Browse files Browse the repository at this point in the history
  • Loading branch information
Craig Peterson committed Jul 1, 2015
1 parent ea721a2 commit e6a449b
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 19 deletions.
4 changes: 2 additions & 2 deletions cmd/bosun/sched/bolt.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ var savePending bool

func (s *Schedule) Save() {
go func() {
s.Lock()
s.Lock("Save")
defer s.Unlock()
if savePending {
return
Expand Down Expand Up @@ -124,7 +124,7 @@ func (s *Schedule) save() {
func (s *Schedule) RestoreState() error {
log.Println("RestoreState")
start := time.Now()
s.Lock()
s.Lock("RestoreState")
defer s.Unlock()
s.Search.Lock()
defer s.Search.Unlock()
Expand Down
10 changes: 5 additions & 5 deletions cmd/bosun/sched/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ func NewStatus(ak expr.AlertKey) *State {
}

func (s *Schedule) GetStatus(ak expr.AlertKey) *State {
s.Lock()
s.Lock("GetStatus")
state := s.status[ak]
s.Unlock()
return state
}

func (s *Schedule) GetOrCreateStatus(ak expr.AlertKey) *State {
s.Lock()
s.Lock("GetOrCreateStatus")
state := s.status[ak]
if state == nil {
state = NewStatus(ak)
Expand Down Expand Up @@ -92,7 +92,7 @@ func (s *Schedule) NewRunHistory(start time.Time, cache *cache.Cache) *RunHistor
func (s *Schedule) RunHistory(r *RunHistory) {
checkNotify := false
silenced := s.Silenced()
s.Lock()
s.Lock("RunHistory")
defer s.Unlock()
for ak, event := range r.Events {
state := s.status[ak]
Expand Down Expand Up @@ -345,7 +345,7 @@ func (r *RunHistory) GetUnknownAndUnevaluatedAlertKeys(alert string) (unknown, u
}
}
if !anyFound {
r.schedule.Lock()
r.schedule.Lock("GetUnknownUneval")
for ak, st := range r.schedule.status {
if ak.Name() != alert {
continue
Expand Down Expand Up @@ -386,7 +386,7 @@ func (s *Schedule) findUnknownAlerts(now time.Time) []expr.AlertKey {
if time.Now().Sub(bosunStartupTime) < s.Conf.CheckFrequency {
return keys
}
s.Lock()
s.Lock("FindUnknown")
for ak, st := range s.status {
if st.Forgotten || st.Status() == StError {
continue
Expand Down
2 changes: 1 addition & 1 deletion cmd/bosun/sched/notify.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func (s *Schedule) Notify(st *State, n *conf.Notification) {
// duration until the soonest notification triggers.
func (s *Schedule) CheckNotifications() time.Duration {
silenced := s.Silenced()
s.Lock()
s.Lock("CheckNotifications")
defer s.Unlock()
notifications := s.Notifications
s.Notifications = nil
Expand Down
44 changes: 36 additions & 8 deletions cmd/bosun/sched/sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ func init() {
}

type Schedule struct {
sync.Mutex
mutex sync.Mutex
mutexHolder string
mutexAquired time.Time
mutexWaitTime int64

Conf *conf.Conf
status States
Expand All @@ -49,10 +52,30 @@ type Schedule struct {
db *bolt.DB
}

func (s *Schedule) TimeLock(t miniprofiler.Timer) {
t.Step("lock", func(t miniprofiler.Timer) {
s.Lock()
})
func init() {
metadata.AddMetricMeta(
"bosun.schedule.lock_time", metadata.Counter, metadata.MilliSecond,
"Length of time spent waiting for or holding the schedule lock.")
metadata.AddMetricMeta(
"bosun.schedule.lock_count", metadata.Counter, metadata.Count,
"Number of times the given caller acquired the lock.")
}

func (s *Schedule) Lock(method string) {
start := time.Now()
s.mutex.Lock()
s.mutexAquired = time.Now()
s.mutexWaitTime = int64(s.mutexAquired.Sub(start) / time.Millisecond) // remember this so we don't have to call put until we leave the critical section.
}

func (s *Schedule) Unlock() {
holder := s.mutexHolder
start := s.mutexAquired
waitTime := s.mutexWaitTime
s.mutex.Unlock()
collect.Add("bosun.schedule.lock_time", opentsdb.TagSet{"caller": holder, op: "wait"}, waitTime)
collect.Add("bosun.schedule.lock_time", opentsdb.TagSet{"caller": holder, op: "hold"}, int64(time.Since(start)/time.Millisecond))
collect.Add("bosun.schedule.lock_count", opentsdb.TagSet{"caller": holder}, 1)
}

type Metavalue struct {
Expand Down Expand Up @@ -260,11 +283,16 @@ type StateGroups struct {
}

func (s *Schedule) MarshalGroups(T miniprofiler.Timer, filter string) (*StateGroups, error) {
silenced := s.Silenced()
var silenced map[expr.AlertKey]Silence
T.Step("Silenced", func(miniprofiler.Timer) {
silenced = s.Silenced()
})
t := StateGroups{
TimeAndDate: s.Conf.TimeAndDate,
}
s.TimeLock(T)
T.Step("lock", func(t miniprofiler.Timer) {
s.Lock("MarshalGroups")
})
defer s.Unlock()
status := make(States)
matches, err := makeFilter(filter)
Expand Down Expand Up @@ -555,7 +583,7 @@ func (s *State) Action(user, message string, t ActionType, timestamp time.Time)
}

func (s *Schedule) Action(user, message string, t ActionType, ak expr.AlertKey) error {
s.Lock()
s.Lock("Action")
defer func() {
s.Unlock()
s.Save()
Expand Down
6 changes: 3 additions & 3 deletions cmd/bosun/sched/silence.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func (s Silence) ID() string {
func (s *Schedule) Silenced() map[expr.AlertKey]Silence {
aks := make(map[expr.AlertKey]Silence)
now := time.Now()
s.Lock()
s.Lock("Silenced")
for _, si := range s.Silence {
for ak := range s.status {
if si.Silenced(now, ak.Name(), ak.Group()) {
Expand Down Expand Up @@ -108,7 +108,7 @@ func (s *Schedule) AddSilence(start, end time.Time, alert, tagList string, forge
}
si.Tags = tags
}
s.Lock()
s.Lock("AddSilence")
defer s.Unlock()
if confirm {
delete(s.Silence, edit)
Expand All @@ -126,7 +126,7 @@ func (s *Schedule) AddSilence(start, end time.Time, alert, tagList string, forge
}

func (s *Schedule) ClearSilence(id string) error {
s.Lock()
s.Lock("ClearSilence")
delete(s.Silence, id)
s.Unlock()
s.Save()
Expand Down

0 comments on commit e6a449b

Please sign in to comment.