Add EMAThroughput sampler

honeycombio · Mar 19, 2023 · 3fb72a1 · 3fb72a1
1 parent 84028f7
commit 3fb72a1
Show file tree

Hide file tree

Showing 3 changed files with 636 additions and 1 deletion.
diff --git a/emathroughput.go b/emathroughput.go
@@ -0,0 +1,359 @@
+package dynsampler
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "math"
+ "sync"
+ "time"
+)
+
+// EMAThroughput implements Sampler and attempts to achieve a given throughput
+// rate, weighting rare traffic and frequent traffic differently so as to end up
+// with the correct value.
+//
+// Based on the EMASampleRate implementation, EMAThroughput differs in that
+// instead of trying to achieve a given sample rate, it tries to reach a given
+// throughput of events. During bursts of traffic, it will reduce sample
+// rates so as to keep the number of events per second roughly constant.
+//
+// Like the EMA sampler, it maintains an Exponential Moving Average of counts
+// seen per key, and adjusts this average at regular intervals. The weight
+// applied to more recent intervals is defined by `Weight`, a number in the open
+// interval (0, 1) - larger values weight the average more toward recent
+// observations. In other words, a larger weight will cause sample rates to
+// adapt more quickly to traffic patterns, while a smaller weight will result in
+// sample rates that are less sensitive to bursts or drops in traffic and thus
+// more consistent over time.
+//
+// Once rates have been calculated, keys that are not found in the saved rate
+// map will always have a sample rate of 1. Keys that occur more frequently will
+// be sampled on a logarithmic curve. In other words, every key will be
+// represented at least once in any given window and more frequent keys will
+// have their sample rate increased proportionally to wind up with the goal
+// throughput.
+//
+// The exported fields are configuration; set them before calling Start, which
+// replaces zero values with the documented defaults.
+type EMAThroughput struct {
+ // AdjustmentInterval defines how often we adjust the moving average from
+ // recent observations. Start rejects values shorter than 1ms. Default 15s.
+ AdjustmentInterval time.Duration
+
+ // Weight is a value in the open interval (0, 1) indicating the weighting
+ // factor used to adjust the EMA. With larger values, newer data will
+ // influence the average more, and older values will be factored out more
+ // quickly. In mathematical literature concerning EMA, this is referred to
+ // as the `alpha` constant.
+ // Default is 0.5
+ Weight float64
+
+ // InitialSampleRate is the sample rate to use during startup, before we
+ // have accumulated enough data to calculate a reasonable desired sample
+ // rate. This is mainly useful in situations where unsampled throughput is
+ // high enough to cause problems.
+ // Default 10.
+ InitialSampleRate int
+
+ // GoalThroughputPerSec is the target number of events to send per second.
+ // Sample rates are generated to squash the total throughput down to match the
+ // goal throughput. Actual throughput may exceed goal throughput.
+ // Default 100.
+ GoalThroughputPerSec int
+
+ // MaxKeys, if greater than 0, limits the number of distinct keys counted
+ // during a single adjustment interval. Once MaxKeys is reached, new keys
+ // will not be counted for that interval, but existing keys will continue
+ // to be counted.
+ MaxKeys int
+
+ // AgeOutValue indicates the threshold for removing keys from the EMA. The
+ // EMA of any key will approach 0 if it is not repeatedly observed, but will
+ // never truly reach it, so we have to decide what constitutes "zero". Keys
+ // with averages below this threshold will be removed from the EMA. Default
+ // is the same as Weight, as this prevents a key with the smallest integer
+ // value (1) from being aged out immediately. This value should generally be
+ // <= Weight, unless you have very specific reasons to set it higher.
+ AgeOutValue float64
+
+ // BurstMultiple, if set, is multiplied by the sum of the running average of
+ // counts to define the burst detection threshold. If total counts observed
+ // for a given interval exceed the threshold, the EMA is updated
+ // immediately, rather than waiting on the AdjustmentInterval.
+ // Defaults to 2; a negative value disables burst detection. With a default
+ // of 2, if your traffic suddenly doubles, burst detection will kick in.
+ BurstMultiple float64
+
+ // BurstDetectionDelay indicates the number of intervals to run after Start
+ // is called before burst detection kicks in. Because zero selects the
+ // default, a delay of zero intervals cannot be configured.
+ // Defaults to 3
+ BurstDetectionDelay uint
+
+ // savedSampleRates holds the most recently calculated rate for each key.
+ savedSampleRates map[string]int
+ // currentCounts accumulates per-key counts for the interval in progress.
+ currentCounts map[string]float64
+ // movingAverage is the per-key EMA of counts across intervals.
+ movingAverage map[string]float64
+ // burstThreshold is the count total that triggers an early update; values
+ // <= 0 never trigger.
+ burstThreshold float64
+ // currentBurstSum is the running total of counts since the last update.
+ currentBurstSum float64
+ // intervalCount is the number of ticker-driven updates since Start.
+ intervalCount uint
+ // burstSignal wakes the background goroutine when a burst is detected.
+ burstSignal chan struct{}
+
+ // haveData indicates that rates have been calculated (or loaded via
+ // LoadState). Until then GetSampleRateMulti returns InitialSampleRate for
+ // every key instead of sampling everything at 1.
+ haveData bool
+ // updating is true while updateMaps is recalculating, so that a second
+ // invocation bails out instead of running concurrently.
+ updating bool
+ // done is closed by Stop to terminate the background goroutine.
+ done chan struct{}
+
+ // lock is held by GetSampleRateMulti, SaveState and LoadState, and by
+ // updateMaps while it swaps state in and out.
+ lock sync.Mutex
+
+ // used only in tests
+ testSignalMapsDone chan struct{}
+}
+
+// Compile-time assertion that *EMAThroughput satisfies the Sampler interface.
+var _ Sampler = (*EMAThroughput)(nil)
+
+// Start applies defaults to any zero-valued configuration field, allocates the
+// internal state, and launches the background goroutine that recalculates
+// sample rates every AdjustmentInterval (or immediately when a burst is
+// signalled). The goroutine runs until Stop is called.
+//
+// It returns an error, without starting anything, when AdjustmentInterval is
+// shorter than one millisecond.
+func (e *EMAThroughput) Start() error {
+	// apply defaults
+	if e.AdjustmentInterval == 0 {
+		e.AdjustmentInterval = 15 * time.Second
+	}
+	if e.AdjustmentInterval < 1*time.Millisecond {
+		return fmt.Errorf("the AdjustmentInterval %v is unreasonably short for a throughput sampler", e.AdjustmentInterval)
+	}
+	if e.InitialSampleRate == 0 {
+		e.InitialSampleRate = 10
+	}
+	if e.GoalThroughputPerSec == 0 {
+		e.GoalThroughputPerSec = 100
+	}
+	if e.Weight == 0 {
+		e.Weight = 0.5
+	}
+	if e.AgeOutValue == 0 {
+		e.AgeOutValue = e.Weight
+	}
+	if e.BurstMultiple == 0 {
+		e.BurstMultiple = 2
+	}
+	if e.BurstDetectionDelay == 0 {
+		e.BurstDetectionDelay = 3
+	}
+
+	// The per-interval counters always start empty.
+	e.currentCounts = make(map[string]float64)
+	// Don't override these maps at startup in case they were loaded from a
+	// previous state via LoadState.
+	if e.savedSampleRates == nil {
+		e.savedSampleRates = make(map[string]int)
+	}
+	if e.movingAverage == nil {
+		e.movingAverage = make(map[string]float64)
+	}
+	e.burstSignal = make(chan struct{})
+	e.done = make(chan struct{})
+
+	// Hand the goroutine its own copies so it never re-reads fields that a
+	// later Start could replace.
+	interval := e.AdjustmentInterval
+	burstSignal := e.burstSignal
+	done := e.done
+
+	go func() {
+		ticker := time.NewTicker(interval)
+		// Use a closure so that whichever ticker is current at exit gets
+		// stopped; `defer ticker.Stop()` would bind to the first ticker only
+		// and leave any replacement running.
+		defer func() { ticker.Stop() }()
+		for {
+			select {
+			case <-burstSignal:
+				// reset ticker when we get a burst so the next regular
+				// update is a full interval away
+				ticker.Stop()
+				ticker = time.NewTicker(interval)
+				e.updateMaps()
+			case <-ticker.C:
+				e.updateMaps()
+				// intervalCount is read by GetSampleRateMulti under the
+				// lock, so the write must hold it too.
+				e.lock.Lock()
+				e.intervalCount++
+				e.lock.Unlock()
+			case <-done:
+				return
+			}
+		}
+	}()
+	return nil
+}
+
+// Stop signals the background goroutine launched by Start to exit. It is safe
+// to call more than once, and calling it before Start is a no-op. It always
+// returns nil.
+func (e *EMAThroughput) Stop() error {
+	e.lock.Lock()
+	defer e.lock.Unlock()
+
+	// Closing a nil channel panics; before Start there is nothing to stop.
+	if e.done == nil {
+		return nil
+	}
+	select {
+	case <-e.done:
+		// Already closed by an earlier Stop; closing again would panic.
+	default:
+		close(e.done)
+	}
+	return nil
+}
+
+// updateMaps folds the counts gathered since the last update into the moving
+// average and calculates a new saved rate map from it. It also refreshes the
+// burst detection threshold.
+//
+// It returns without changing anything when no traffic was counted or when
+// another updateMaps is already in progress. The lock is held only while
+// state is swapped in and out, not during the calculation itself.
+//
+// NOTE(review): movingAverage is read and written here without the lock,
+// while SaveState and LoadState touch it under the lock - confirm callers
+// never invoke those concurrently with an update.
+func (e *EMAThroughput) updateMaps() {
+	e.lock.Lock()
+	if e.testSignalMapsDone != nil {
+		// Tests block on this channel to learn that an update pass finished,
+		// including the early-return paths below.
+		defer func() {
+			e.testSignalMapsDone <- struct{}{}
+		}()
+	}
+	// short circuit if no traffic
+	if len(e.currentCounts) == 0 {
+		// No traffic the last interval, don't update anything. This is deliberate to avoid
+		// the average decaying when there's no traffic (comes in bursts, or there's some kind of outage).
+		e.lock.Unlock()
+		return
+	}
+	// If there is another updateMaps going, bail
+	if e.updating {
+		e.lock.Unlock()
+		return
+	}
+	e.updating = true
+	// Take ownership of the counters and hand GetSampleRateMulti a fresh map
+	// so counting can continue while we calculate.
+	tmpCounts := e.currentCounts
+	e.currentCounts = make(map[string]float64)
+	e.currentBurstSum = 0
+	e.lock.Unlock()
+
+	e.updateEMA(tmpCounts)
+
+	// One pass over the averages yields both totals we need:
+	//   sumEvents - expected events per interval, used for burst detection
+	//   logSum    - sum of log10 of each average, used to apportion the goal
+	// We take the max of (1, count) because count * weight is < 1 for very
+	// small counts, which throws off the sums and can cause incorrect sample
+	// rates to be computed when throughput is low.
+	var sumEvents, logSum float64
+	for _, count := range e.movingAverage {
+		clamped := math.Max(1, count)
+		sumEvents += clamped
+		logSum += math.Log10(clamped)
+	}
+
+	// Store this for burst detection. This is checked in GetSampleRateMulti
+	// so we need to grab the lock when we update it.
+	e.lock.Lock()
+	e.burstThreshold = sumEvents * e.BurstMultiple
+	e.lock.Unlock()
+
+	// Calculate the number of events we'd like to let through per adjustment
+	// interval: the per-second goal times the length of the interval.
+	// Equivalently, with avgRate = sumEvents / interval / goalPerSec, this is
+	// sumEvents / avgRate.
+	goalCount := float64(e.GoalThroughputPerSec) * e.AdjustmentInterval.Seconds()
+
+	// goalRatio is the goalCount divided by the sum of all the log values - it
+	// determines what percentage of the total event space belongs to each key.
+	// When every average is <= 1 all log terms are 0; dividing would give
+	// +Inf, and +Inf multiplied by a zero log term is NaN, so use 0 instead.
+	var goalRatio float64
+	if logSum > 0 {
+		goalRatio = goalCount / logSum
+	}
+
+	newSavedSampleRates := calculateSampleRates(goalRatio, e.movingAverage)
+	e.lock.Lock()
+	defer e.lock.Unlock()
+	e.savedSampleRates = newSavedSampleRates
+	e.haveData = true
+	e.updating = false
+}
+
+// GetSampleRate takes a key and returns the appropriate sample rate for that
+// key.
+func (e *EMAThroughput) GetSampleRate(key string) int {
+ return e.GetSampleRateMulti(key, 1)
+}
+
+// GetSampleRateMulti records count occurrences of key (for example, the number
+// of spans the key represents) and returns the sample rate currently in effect
+// for that key.
+//
+// The result is InitialSampleRate until rates have been calculated or loaded,
+// the saved rate for a known key after that, and 1 for a key with no saved
+// rate. If the running total for this interval reaches the burst threshold,
+// the background goroutine is asked to recalculate right away.
+func (e *EMAThroughput) GetSampleRateMulti(key string, count int) int {
+	e.lock.Lock()
+	defer e.lock.Unlock()
+
+	// Count the observation unless MaxKeys is in force, the table is already
+	// full, and this key is not yet in it.
+	_, tracked := e.currentCounts[key]
+	if e.MaxKeys <= 0 || tracked || len(e.currentCounts) < e.MaxKeys {
+		seen := float64(count)
+		e.currentCounts[key] += seen
+		e.currentBurstSum += seen
+	}
+
+	// Burst detection is armed only with a positive threshold and after the
+	// configured number of warm-up intervals.
+	armed := e.burstThreshold > 0 && e.intervalCount >= e.BurstDetectionDelay
+	if armed && e.currentBurstSum >= e.burstThreshold {
+		// Zero the sum so further calls don't re-signal while updateMaps runs.
+		e.currentBurstSum = 0
+		// Never block here: the receiver may be inside updateMaps waiting on
+		// the very lock we hold.
+		select {
+		case e.burstSignal <- struct{}{}:
+		default:
+		}
+	}
+
+	if !e.haveData {
+		return e.InitialSampleRate
+	}
+	rate, known := e.savedSampleRates[key]
+	if !known {
+		return 1
+	}
+	return rate
+}
+
+func (e *EMAThroughput) updateEMA(newCounts map[string]float64) {
+ keysToUpdate := make([]string, 0, len(e.movingAverage))
+ for key := range e.movingAverage {
+ keysToUpdate = append(keysToUpdate, key)
+ }
+
+ // Update any existing keys with new values
+ for _, key := range keysToUpdate {
+ var newAvg float64
+ // Was this key seen in the last interval? Adjust by that amount
+ if val, found := newCounts[key]; found {
+ newAvg = adjustAverage(e.movingAverage[key], val, e.Weight)
+ } else {
+ // Otherwise adjust by zero
+ newAvg = adjustAverage(e.movingAverage[key], 0, e.Weight)
+ }
+
+ // Age out this value if it's too small to care about for calculating sample rates
+ // This is also necessary to keep our map from going forever.
+ if newAvg < e.AgeOutValue {
+ delete(e.movingAverage, key)
+ } else {
+ e.movingAverage[key] = newAvg
+ }
+ // We've processed this key - don't process it again when we look at new counts
+ delete(newCounts, key)
+ }
+
+ for key := range newCounts {
+ newAvg := adjustAverage(0, newCounts[key], e.Weight)
+ if newAvg >= e.AgeOutValue {
+ e.movingAverage[key] = newAvg
+ }
+ }
+}
+
+// emaThroughputState is the JSON shape used by SaveState and LoadState to
+// persist the calculated sample rates and the moving average.
+type emaThroughputState struct {
+ // These fields are exported so that `json.Marshal` and `json.Unmarshal`
+ // can see them.
+ SavedSampleRates map[string]int `json:"saved_sample_rates"`
+ MovingAverage map[string]float64 `json:"moving_average"`
+}
+
+// SaveState serializes the saved sample rates and the moving average to JSON
+// so they can later be restored with LoadState. It returns an error if either
+// map is nil, or if encoding fails.
+func (e *EMAThroughput) SaveState() ([]byte, error) {
+	e.lock.Lock()
+	defer e.lock.Unlock()
+
+	switch {
+	case e.savedSampleRates == nil:
+		return nil, errors.New("saved sample rate map is nil")
+	case e.movingAverage == nil:
+		return nil, errors.New("moving average map is nil")
+	}
+
+	snapshot := emaThroughputState{
+		SavedSampleRates: e.savedSampleRates,
+		MovingAverage:    e.movingAverage,
+	}
+	return json.Marshal(&snapshot)
+}
+
+// LoadState accepts a byte array with a JSON representation of a previous instance's
+// state
+func (e *EMAThroughput) LoadState(state []byte) error {
+ e.lock.Lock()
+ defer e.lock.Unlock()
+
+ s := emaThroughputState{}
+ err := json.Unmarshal(state, &s)
+ if err != nil {
+ return err
+ }
+
+ // Load the previously calculated sample rates
+ e.savedSampleRates = s.SavedSampleRates
+ e.movingAverage = s.MovingAverage
+ // Allow GetSampleRate to return calculated sample rates from the loaded map
+ e.haveData = true
+
+ return nil
+}