manifest: add range annotations

This change adds a "range annotation" feature to Annotators, which are computations that aggregate some value over a specific key range within a level. Level-wide annotations are now computed internally as a range annotation with a key range spanning the whole level. Range annotations use the same B-tree caching behavior as regular annotations, so queries remain fast even with thousands of tables because they avoid a sequential iteration over a level's files. This PR only sets up range annotations without changing any existing behavior. See cockroachdb#3793 for some potential use cases. `BenchmarkNumFilesRangeAnnotation` shows that range annotations are significantly faster than using `version.Overlaps` to aggregate over a key range:
```
pkg: github.com/cockroachdb/pebble/internal/manifest
BenchmarkNumFilesRangeAnnotation/annotator-10 232282 4716 ns/op 112 B/op 7 allocs/op
BenchmarkNumFilesRangeAnnotation/overlaps-10 2110 545482 ns/op 400 B/op 9 allocs/op
```
anish-shanbhag · Aug 8, 2024 · 2d16b80 · 2d16b80
1 parent 3419a64
commit 2d16b80
Show file tree

Hide file tree

Showing 7 changed files with 306 additions and 98 deletions.
diff --git a/internal/manifest/annotator.go b/internal/manifest/annotator.go
@@ -4,6 +4,12 @@
 
 package manifest
 
+import (
+ "sort"
+
+ "github.com/cockroachdb/pebble/internal/base"
+)
+
 // The Annotator type defined below is used by other packages to lazily
 // compute a value over a B-Tree. Each node of the B-Tree stores one
 // `annotation` per annotator, containing the result of the computation over
@@ -59,6 +65,11 @@ type annotation struct {
  // AnnotationAggregator.Accumulate or AnnotationAggregator.Merge.
  // NB: This is untyped for the same reason as annotator above.
  v interface{}
+
+ // scratch is used to hold the aggregated annotation value when computing
+ // range annotations in order to avoid overwriting an already cached
+ // annotation, and to also avoid additional allocations.
+ scratch interface{}
  // valid indicates whether future reads of the annotation may use the
  // value as-is. If false, v will be zeroed and recalculated.
  valid bool
@@ -81,39 +92,114 @@ func (a *Annotator[T]) findAnnotation(n *node) *annotation {
  return &n.annot[len(n.annot)-1]
 }
 
-// nodeAnnotation computes this annotator's annotation of this node across all
-// files in the node's subtree. The second return value indicates whether the
-// annotation is stable and thus cacheable.
-func (a *Annotator[T]) nodeAnnotation(n *node) (_ *T, cacheOK bool) {
+// nodeRangeAnnotation computes this annotator's annotation of a node across
+// all files in the range defined by lowerBound and upperBound. The second
+// return value indicates whether the annotation is stable and thus cacheable.
+func (a *Annotator[T]) nodeRangeAnnotation(
+ n *node,
+ cmp base.Compare,
+ // lowerBound and upperBound may be nil to indicate no lower or upper bound.
+ lowerBound []byte,
+ // upperBound is a UserKeyBoundary that may be inclusive or exclusive.
+ upperBound *base.UserKeyBoundary,
+) (v *T, cacheOK bool) {
  annot := a.findAnnotation(n)
- t := annot.v.(*T)
- // If the annotation is already marked as valid, we can return it without
+ // If the annotation is already marked as valid and this node's
+ // subtree is fully within the bounds, we can return it without
  // recomputing anything.
- if annot.valid {
- return t, true
+ if lowerBound == nil && upperBound == nil && annot.valid {
+ return annot.v.(*T), true
+ }
+
+ // We will accumulate annotations from each item in the end-exclusive
+ // range [leftItem, rightItem).
+ leftItem, rightItem := 0, int(n.count)
+ if lowerBound != nil {
+ // leftItem is the index of the first item that overlaps the lower bound.
+ leftItem = sort.Search(int(n.count), func(i int) bool {
+ return cmp(lowerBound, n.items[i].Largest.UserKey) <= 0
+ })
+ }
+ if upperBound != nil {
+ // rightItem is the index of the first item that does not overlap the
+ // upper bound.
+ rightItem = sort.Search(int(n.count), func(i int) bool {
+ return !upperBound.IsUpperBoundFor(cmp, n.items[i].Smallest.UserKey)
+ })
+ }
+
+ var result *T
+ switch {
+ // If there is no cached annotation, we can directly write to the node's
+ // annotation value.
+ case !annot.valid:
+ result = a.Aggregator.Zero(annot.v.(*T))
+ // Otherwise, use annot.scratch as scratch space to avoid allocations.
+ // The allocation for annot.scratch is performed lazily here instead of
+ // within findAnnotation to avoid an allocation when range annotations
+ // are not used.
+ case annot.scratch == nil:
+ annot.scratch = a.Aggregator.Zero(nil)
+ result = annot.scratch.(*T)
+ default:
+ result = a.Aggregator.Zero(annot.scratch.(*T))
  }
 
- t = a.Aggregator.Zero(t)
  valid := true
+ // Accumulate annotations from every item that overlaps the bounds.
+ for i := leftItem; i < rightItem; i++ {
+ v, ok := a.Aggregator.Accumulate(n.items[i], result)
+ result = v
+ valid = valid && ok
+ }
 
- for i := int16(0); i <= n.count; i++ {
- if !n.leaf {
- v, ok := a.nodeAnnotation(n.children[i])
- t = a.Aggregator.Merge(v, t)
- valid = valid && ok
+ if !n.leaf {
+ // We will accumulate annotations from each child in the end-inclusive
+ // range [leftChild, rightChild].
+ leftChild, rightChild := leftItem, rightItem
+ // If the lower bound falls within the item at leftItem, there is no
+ // need to accumulate annotations from the child to its left.
+ if leftItem < int(n.count) && cmp(lowerBound, n.items[leftItem].Smallest.UserKey) >= 0 {
+ leftChild++
  }
+ // If the upper bound spans beyond the child at rightItem, we must also
+ // accumulate annotations from the child to its right.
+ if rightItem < int(n.count) && upperBound.IsUpperBoundFor(cmp, n.items[rightItem].Largest.UserKey) {
+ rightChild++
+ }
+
+ for i := leftChild; i <= rightChild; i++ {
+ newLowerBound, newUpperBound := lowerBound, upperBound
+ // If this child is to the right of leftItem, then its entire
+ // subtree is within the lower bound.
+ if i > leftItem {
+ newLowerBound = nil
+ }
+ // If this child is to the left of rightItem, then its entire
+ // subtree is within the upper bound.
+ if i < rightItem {
+ newUpperBound = nil
+ }
 
- if i < n.count {
- var ok bool
- t, ok = a.Aggregator.Accumulate(n.items[i], t)
+ v, ok := a.nodeRangeAnnotation(
+ n.children[i],
+ cmp,
+ newLowerBound,
+ newUpperBound,
+ )
+ result = a.Aggregator.Merge(v, result)
  valid = valid && ok
  }
  }
 
- annot.v = t
- annot.valid = valid
+ // Update this node's cached annotation only if we accumulated from its
+ // entire subtree.
+ if lowerBound == nil && upperBound == nil {
+ annot.v = result
+ annot.valid = valid
+ }
 
- return t, annot.valid
+ return result, valid
 }
 
 // InvalidateAnnotation removes any existing cached annotations from this
@@ -138,7 +224,7 @@ func (a *Annotator[T]) LevelAnnotation(lm LevelMetadata) *T {
  return a.Aggregator.Zero(nil)
  }
 
- v, _ := a.nodeAnnotation(lm.tree.root)
+ v, _ := a.nodeRangeAnnotation(lm.tree.root, lm.tree.cmp, nil, nil)
  return v
 }
 
@@ -158,6 +244,21 @@ func (a *Annotator[T]) MultiLevelAnnotation(lms []LevelMetadata) *T {
  return aggregated
 }
 
+// LevelRangeAnnotation calculates the annotation defined by this Annotator for
+// the files within LevelMetadata which overlap the key range described by
+// bounds; the end boundary may be inclusive or exclusive. A pointer to the
+// Annotator is used as the key for pre-calculated values, so the same Annotator
+// must be used to avoid duplicate computation. LevelRangeAnnotation must not be
+// called concurrently, and in practice this is achieved by requiring callers to
+// hold DB.mu.
+func (a *Annotator[T]) LevelRangeAnnotation(lm LevelMetadata, bounds *base.UserKeyBounds) *T {
+ if lm.Empty() {
+ return a.Aggregator.Zero(nil)
+ }
+
+ v, _ := a.nodeRangeAnnotation(lm.tree.root, lm.tree.cmp, bounds.Start, &bounds.End)
+ return v
+}
+
 // InvalidateAnnotation clears any cached annotations defined by Annotator. A
 // pointer to the Annotator is used as the key for pre-calculated values, so
 // the same Annotator must be used to clear the appropriate cached annotation.
@@ -206,6 +307,14 @@ func SumAnnotator(accumulate func(f *FileMetadata) (v uint64, cacheOK bool)) *An
  }
 }
 
+// NumFilesAnnotator is an Annotator which computes an annotation value
+// equal to the number of files included in the annotation. Particularly, it
+// can be used to efficiently calculate the number of files in a given key
+// range using range annotations.
+var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
+ return 1, true
+})
+
 // PickFileAggregator implements the AnnotationAggregator interface. It defines
 // an aggregator that picks a single file from a set of eligible files.
 type PickFileAggregator struct {