manifest: add range annotations

This change adds a "range annotation" feature to Annotators. Range annotations are computations that aggregate some value over a specific key range within a level. Level-wide annotations are now computed internally as a range annotation with a key range spanning the whole level. Range annotations use the same B-tree caching behavior as regular annotations, so queries remain fast even with thousands of tables because they avoid a sequential iteration over a level's files. This PR only sets up range annotations without changing any existing behavior. See cockroachdb#3793 for some potential use cases. `BenchmarkNumFilesRangeAnnotation` shows that range annotations are significantly faster than using `version.Overlaps` to aggregate over a key range: ``` pkg: github.com/cockroachdb/pebble/internal/manifest BenchmarkNumFilesRangeAnnotation/annotator-10 232282 4716 ns/op 112 B/op 7 allocs/op BenchmarkNumFilesRangeAnnotation/overlaps-10 2110 545482 ns/op 400 B/op 9 allocs/op ```
anish-shanbhag · Jul 26, 2024 · 282a3b1 · 282a3b1
1 parent e36d078
commit 282a3b1
Show file tree

Hide file tree

Showing 7 changed files with 306 additions and 95 deletions.
diff --git a/internal/manifest/annotator.go b/internal/manifest/annotator.go
@@ -4,6 +4,12 @@
 
 package manifest
 
+import (
+ "sort"
+
+ "github.com/cockroachdb/pebble/internal/base"
+)
+
 // The Annotator type defined below is used by other packages to lazily
 // compute a value over a B-Tree. Each node of the B-Tree stores one
 // `annotation` per annotator, containing the result of the computation over
@@ -59,6 +65,11 @@ type annotation struct {
  // AnnotationAggregator.Accumulate or AnnotationAggregator.Merge.
  // NB: This is untyped for the same reason as annotator above.
  v interface{}
+
+ // scratch is used to hold the aggregated annotation value when computing
+ // range annotations in order to avoid overwriting an already cached
+ // annotation, and to also avoid additional allocations.
+ scratch interface{}
  // valid indicates whether future reads of the annotation may use the
  // value as-is. If false, v will be zeroed and recalculated.
  valid bool
@@ -81,36 +92,114 @@ func (a *Annotator[T]) findAnnotation(n *node) *annotation {
  return &n.annot[len(n.annot)-1]
 }
 
-// nodeAnnotation computes this annotator's annotation of this node across all
-// files in the node's subtree. The second return value indicates whether the
-// annotation is stable and thus cacheable.
-func (a *Annotator[T]) nodeAnnotation(n *node) (v *T, cacheOK bool) {
+// nodeRangeAnnotation computes this annotator's annotation of a node across
+// all files in the range defined by lowerBound and upperBound. The second
+// return value indicates whether the annotation is stable and thus cacheable.
+func (a *Annotator[T]) nodeRangeAnnotation(
+ n *node,
+ cmp base.Compare,
+ // lowerBound and upperBound may be nil to indicate no lower or upper bound.
+ lowerBound []byte,
+ // upperBound is a UserKeyBoundary that may be inclusive or exclusive.
+ upperBound *base.UserKeyBoundary,
+) (v *T, cacheOK bool) {
  annot := a.findAnnotation(n)
- vtyped := annot.v.(*T)
- // If the annotation is already marked as valid, we can return it without
+ // If the annotation is already marked as valid and this node's
+ // subtree is fully within the bounds, we can return it without
  // recomputing anything.
- if annot.valid {
- return vtyped, true
+ if lowerBound == nil && upperBound == nil && annot.valid {
+ return annot.v.(*T), true
  }
 
- annot.v = a.Aggregator.Zero(vtyped)
- annot.valid = true
+ // We will accumulate annotations from each item in the end-exclusive
+ // range [leftItem, rightItem).
+ leftItem, rightItem := 0, int(n.count)
+ if lowerBound != nil {
+ // leftItem is the index of the first item that overlaps the lower bound.
+ leftItem = sort.Search(int(n.count), func(i int) bool {
+ return cmp(lowerBound, n.items[i].Largest.UserKey) <= 0
+ })
+ }
+ if upperBound != nil {
+ // rightItem is the index of the first item that does not overlap the
+ // upper bound.
+ rightItem = sort.Search(int(n.count), func(i int) bool {
+ return !upperBound.IsUpperBoundFor(cmp, n.items[i].Smallest.UserKey)
+ })
+ }
 
- for i := int16(0); i <= n.count; i++ {
- if !n.leaf {
- v, ok := a.nodeAnnotation(n.children[i])
- annot.v = a.Aggregator.Merge(v, vtyped)
- annot.valid = annot.valid && ok
+ var result *T
+ switch {
+ // If there is no cached annotation, we can directly write to the node's
+ // annotation value.
+ case !annot.valid:
+ result = a.Aggregator.Zero(annot.v.(*T))
+ // Otherwise, use annot.scratch as scratch space to avoid allocations.
+ // The allocation for annot.scratch is performed lazily here instead of
+ // within findAnnotation to avoid an allocation when range annotations
+ // are not used.
+ case annot.scratch == nil:
+ annot.scratch = a.Aggregator.Zero(nil)
+ result = annot.scratch.(*T)
+ default:
+ result = a.Aggregator.Zero(annot.scratch.(*T))
+ }
+
+ valid := true
+ // Accumulate annotations from every item that overlaps the bounds.
+ for i := leftItem; i < rightItem; i++ {
+ v, ok := a.Aggregator.Accumulate(n.items[i], result)
+ result = v
+ valid = valid && ok
+ }
+
+ if !n.leaf {
+ // We will accumulate annotations from each child in the end-inclusive
+ // range [leftChild, rightChild].
+ leftChild, rightChild := leftItem, rightItem
+ // If the lower bound overlaps with the item at leftItem, there is no
+ // need to accumulate annotations from the child to its left.
+ if leftItem < int(n.count) && cmp(lowerBound, n.items[leftItem].Smallest.UserKey) >= 0 {
+ leftChild++
+ }
+ // If the upper bound spans beyond the child at rightItem, we must also
+ // accumulate annotations from the child to its right.
+ if rightItem < int(n.count) && upperBound.IsUpperBoundFor(cmp, n.items[rightItem].Largest.UserKey) {
+ rightChild++
  }
 
- if i < n.count {
- v, ok := a.Aggregator.Accumulate(n.items[i], vtyped)
- annot.v = v
- annot.valid = annot.valid && ok
+ for i := leftChild; i <= rightChild; i++ {
+ newLowerBound, newUpperBound := lowerBound, upperBound
+ // If this child is to the right of leftItem, then its entire
+ // subtree is within the lower bound.
+ if i > leftItem {
+ newLowerBound = nil
+ }
+ // If this child is to the left of rightItem, then its entire
+ // subtree is within the upper bound.
+ if i < rightItem {
+ newUpperBound = nil
+ }
+
+ v, ok := a.nodeRangeAnnotation(
+ n.children[i],
+ cmp,
+ newLowerBound,
+ newUpperBound,
+ )
+ result = a.Aggregator.Merge(v, result)
+ valid = valid && ok
  }
  }
 
- return annot.v.(*T), annot.valid
+ // Update this node's cached annotation only if we accumulated from its
+ // entire subtree.
+ if lowerBound == nil && upperBound == nil {
+ annot.v = result
+ annot.valid = valid
+ }
+
+ return result, valid
 }
 
 // InvalidateAnnotation removes any existing cached annotations from this
@@ -135,7 +224,7 @@ func (a *Annotator[T]) LevelAnnotation(lm LevelMetadata) *T {
  return a.Aggregator.Zero(nil)
  }
 
- v, _ := a.nodeAnnotation(lm.tree.root)
+ v, _ := a.nodeRangeAnnotation(lm.tree.root, lm.tree.cmp, nil, nil)
  return v
 }
 
@@ -155,6 +244,21 @@ func (a *Annotator[T]) MultiLevelAnnotation(lms []LevelMetadata) *T {
  return aggregated
 }
 
+// LevelRangeAnnotation calculates the annotation defined by this Annotator for
+// the files within LevelMetadata which overlap bounds, whose end key may be
+// inclusive or exclusive. A pointer to the Annotator is used as the key for
+// pre-calculated values, so the same Annotator must be used to avoid duplicate
+// computation. LevelRangeAnnotation must not be called concurrently; in
+// practice this is achieved by requiring callers to hold DB.mu.
+func (a *Annotator[T]) LevelRangeAnnotation(lm LevelMetadata, bounds *base.UserKeyBounds) *T {
+ if lm.Empty() {
+ return a.Aggregator.Zero(nil)
+ }
+
+ v, _ := a.nodeRangeAnnotation(lm.tree.root, lm.tree.cmp, bounds.Start, &bounds.End)
+ return v
+}
+
 // InvalidateAnnotation clears any cached annotations defined by Annotator. A
 // pointer to the Annotator is used as the key for pre-calculated values, so
 // the same Annotator must be used to clear the appropriate cached annotation.
@@ -203,6 +307,14 @@ func SumAnnotator(accumulate func(f *FileMetadata) (v uint64, cacheOK bool)) *An
  }
 }
 
+// NumFilesAnnotator is an Annotator which computes an annotation value
+// equal to the number of files included in the annotation. Particularly, it
+// can be used to efficiently calculate the number of files in a given key
+// range using range annotations.
+var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
+ return 1, true
+})
+
 // PickFileAggregator implements the AnnotationAggregator interface. It defines
 // an aggregator that picks a single file from a set of eligible files.
 type PickFileAggregator struct {

diff --git a/internal/manifest/annotator_test.go b/internal/manifest/annotator_test.go
@@ -5,54 +5,150 @@
 package manifest
 
 import (
+ "math/rand"
  "testing"
 
  "github.com/cockroachdb/pebble/internal/base"
  "github.com/stretchr/testify/require"
 )
 
-func makeTestLevelMetadata(count int) (LevelMetadata, []*FileMetadata) {
- files := make([]*FileMetadata, count)
- for i := 0; i < count; i++ {
- files[i] = newItem(key(i))
+// Creates a version with numFiles files in level 6.
+func makeTestVersion(numFiles int) (*Version, []*FileMetadata) {
+ files := make([]*FileMetadata, numFiles)
+ for i := 0; i < numFiles; i++ {
+ // Each file spans 10 keys, e.g. [0->9], [10->19], etc.
+ files[i] = (&FileMetadata{}).ExtendPointKeyBounds(
+ base.DefaultComparer.Compare, key(i*10), key(i*10+9),
+ )
+ files[i].InitPhysicalBacking()
  }
 
- lm := MakeLevelMetadata(base.DefaultComparer.Compare, 6, files)
- return lm, files
-}
+ var levelFiles [7][]*FileMetadata
+ levelFiles[6] = files
 
-// NumFilesAnnotator is an Annotator which computes an annotation value
-// equal to the number of files included in the annotation.
-var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
- return 1, true
-})
+ v := NewVersion(base.DefaultComparer, 0, levelFiles)
+ return v, files
+}
 
 func TestNumFilesAnnotator(t *testing.T) {
  const count = 1000
- lm, _ := makeTestLevelMetadata(0)
+ v, _ := makeTestVersion(0)
 
  for i := 1; i <= count; i++ {
- lm.tree.Insert(newItem(key(i)))
- numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
+ v.Levels[6].tree.Insert(newItem(key(i)))
+ numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
  require.EqualValues(t, i, numFiles)
  }
-
- numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
- require.EqualValues(t, count, numFiles)
-
- numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
- require.EqualValues(t, count, numFiles)
-
- lm.tree.Delete(newItem(key(count / 2)))
- numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
- require.EqualValues(t, count-1, numFiles)
 }
 
 func BenchmarkNumFilesAnnotator(b *testing.B) {
- lm, _ := makeTestLevelMetadata(0)
+ v, _ := makeTestVersion(0)
  for i := 1; i <= b.N; i++ {
- lm.tree.Insert(newItem(key(i)))
- numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
+ v.Levels[6].tree.Insert(newItem(key(i)))
+ numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
  require.EqualValues(b, uint64(i), numFiles)
  }
 }
+
+func bounds(i int, j int, exclusive bool) *base.UserKeyBounds {
+ b := base.UserKeyBoundsEndExclusiveIf(key(i).UserKey, key(j).UserKey, exclusive)
+ return &b
+}
+
+func randomBounds(rng *rand.Rand, count int) *base.UserKeyBounds {
+ first := rng.Intn(count)
+ second := rng.Intn(count)
+ exclusive := rng.Intn(2) == 0
+ return bounds(min(first, second), max(first, second), exclusive)
+}
+
+func requireMatchOverlaps(t *testing.T, v *Version, bounds *base.UserKeyBounds) {
+ overlaps := v.Overlaps(6, *bounds)
+ numFiles := *NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], bounds)
+ require.EqualValues(t, overlaps.length, numFiles)
+}
+
+func TestNumFilesRangeAnnotationEmptyRanges(t *testing.T) {
+ const count = 5_000
+ v, files := makeTestVersion(count)
+
+ // Delete files containing key ranges [0, 999] and [24_000, 25_999].
+ for i := 0; i < 100; i++ {
+ v.Levels[6].tree.Delete(files[i])
+ }
+ for i := 2400; i < 2600; i++ {
+ v.Levels[6].tree.Delete(files[i])
+ }
+
+ // Ranges that are completely empty.
+ requireMatchOverlaps(t, v, bounds(1, 999, false))
+ requireMatchOverlaps(t, v, bounds(0, 1000, true))
+ requireMatchOverlaps(t, v, bounds(50_000, 60_000, false))
+ requireMatchOverlaps(t, v, bounds(24_500, 25_500, false))
+ requireMatchOverlaps(t, v, bounds(24_000, 26_000, true))
+
+ // Partial overlaps with empty ranges.
+ requireMatchOverlaps(t, v, bounds(0, 1000, false))
+ requireMatchOverlaps(t, v, bounds(20, 1001, true))
+ requireMatchOverlaps(t, v, bounds(20, 1010, true))
+ requireMatchOverlaps(t, v, bounds(23_000, 27_000, true))
+ requireMatchOverlaps(t, v, bounds(25_000, 40_000, false))
+ requireMatchOverlaps(t, v, bounds(25_500, 26_001, true))
+
+ // Ranges which only span a single table.
+ requireMatchOverlaps(t, v, bounds(45_000, 45_000, true))
+ requireMatchOverlaps(t, v, bounds(30_000, 30_001, true))
+ requireMatchOverlaps(t, v, bounds(23_000, 23_000, false))
+}
+
+func TestNumFilesRangeAnnotationRandomized(t *testing.T) {
+ const count = 10_000
+ const numIterations = 10_000
+
+ v, _ := makeTestVersion(count)
+
+ rng := rand.New(rand.NewSource(int64(0)))
+ for i := 0; i < numIterations; i++ {
+ requireMatchOverlaps(t, v, randomBounds(rng, count*11))
+ }
+}
+
+func BenchmarkNumFilesRangeAnnotation(b *testing.B) {
+ const count = 100_000
+ v, files := makeTestVersion(count)
+
+ rng := rand.New(rand.NewSource(int64(0)))
+ b.Run("annotator", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ b := randomBounds(rng, count*11)
+ // Randomly delete and reinsert a file to verify that range
+ // annotations are still fast despite small mutations.
+ toDelete := rng.Intn(count)
+ v.Levels[6].tree.Delete(files[toDelete])
+
+ NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], b)
+
+ v.Levels[6].tree.Insert(files[toDelete])
+ }
+ })
+
+ // Also benchmark an equivalent aggregation using version.Overlaps to show
+ // the difference in performance.
+ b.Run("overlaps", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ b := randomBounds(rng, count*11)
+ toDelete := rng.Intn(count)
+ v.Levels[6].tree.Delete(files[toDelete])
+
+ overlaps := v.Overlaps(6, *b)
+ iter := overlaps.Iter()
+ numFiles := 0
+ for f := iter.First(); f != nil; f = iter.Next() {
+ numFiles++
+ }
+
+ v.Levels[6].tree.Insert(files[toDelete])
+ }
+ })
+
+}