manifest: add range annotations

This change adds "range annotations" to Annotators: computations that aggregate some value over a specific key range within a level. Range annotations use the same B-tree caching behavior as regular annotations, so queries remain fast even with thousands of tables because they avoid a sequential iteration over a level's files.

This PR only sets up range annotations without changing any existing behavior. See cockroachdb#3793 for some potential use cases.

`BenchmarkNumFilesRangeAnnotation` shows that range annotations are significantly faster than using `version.Overlaps` to aggregate over a key range:

```
pkg: github.com/cockroachdb/pebble/internal/manifest
BenchmarkNumFilesRangeAnnotation/annotator-10    306010      4015 ns/op     48 B/op    6 allocs/op
BenchmarkNumFilesRangeAnnotation/overlaps-10       2223    513519 ns/op    336 B/op    8 allocs/op
```
anish-shanbhag · Aug 13, 2024 · 4776fa3 · 4776fa3
1 parent 3419a64
commit 4776fa3
Show file tree

Hide file tree

Showing 7 changed files with 286 additions and 79 deletions.
diff --git a/internal/manifest/annotator.go b/internal/manifest/annotator.go
@@ -4,6 +4,12 @@
 
 package manifest
 
+import (
+ "sort"
+
+ "github.com/cockroachdb/pebble/internal/base"
+)
+
 // The Annotator type defined below is used by other packages to lazily
 // compute a value over a B-Tree. Each node of the B-Tree stores one
 // `annotation` per annotator, containing the result of the computation over
@@ -24,6 +30,10 @@ package manifest
 // computed incrementally as edits are applied to a level.
 type Annotator[T any] struct {
  Aggregator AnnotationAggregator[T]
+
+ // scratch holds the aggregated value while a range annotation is computed; it
+ // is reused across LevelRangeAnnotation calls to avoid additional allocations.
+ scratch *T
 }
 
 // An AnnotationAggregator defines how an annotation should be accumulated
@@ -116,6 +126,80 @@ func (a *Annotator[T]) nodeAnnotation(n *node) (_ *T, cacheOK bool) {
  return t, annot.valid
 }
 
+// accumulateRangeAnnotation computes this annotator's annotation across all
+// files in the subtree rooted at n which overlap with the range defined by bounds.
+// The result is accumulated into a.scratch, which LevelRangeAnnotation zeroes first.
+func (a *Annotator[T]) accumulateRangeAnnotation(
+ n *node,
+ cmp base.Compare,
+ bounds base.UserKeyBounds,
+ // fullyWithinLowerBound and fullyWithinUpperBound indicate whether this
+ // node's subtree is already known to be within each bound.
+ fullyWithinLowerBound bool,
+ fullyWithinUpperBound bool,
+) {
+ // If this node's subtree is fully within the bounds, merge the node's regular
+ // whole-subtree annotation into a.scratch; the cacheOK result is discarded.
+ if fullyWithinLowerBound && fullyWithinUpperBound {
+ v, _ := a.nodeAnnotation(n)
+ a.scratch = a.Aggregator.Merge(v, a.scratch)
+ return
+ }
+
+ // We will accumulate annotations from each item in the end-exclusive
+ // range [leftItem, rightItem).
+ leftItem, rightItem := 0, int(n.count)
+ if !fullyWithinLowerBound {
+ // leftItem is the index of the first item whose largest user key is >= bounds.Start.
+ leftItem = sort.Search(int(n.count), func(i int) bool {
+ return cmp(bounds.Start, n.items[i].Largest.UserKey) <= 0
+ })
+ }
+ if !fullyWithinUpperBound {
+ // rightItem is the index of the first item for which bounds.End is not an upper
+ // bound of its smallest user key, i.e. the first item past the upper bound.
+ rightItem = sort.Search(int(n.count), func(i int) bool {
+ return !bounds.End.IsUpperBoundFor(cmp, n.items[i].Smallest.UserKey)
+ })
+ }
+
+ // Accumulate annotations from every item that overlaps the bounds.
+ for i := leftItem; i < rightItem; i++ {
+ v, _ := a.Aggregator.Accumulate(n.items[i], a.scratch)
+ a.scratch = v
+ }
+
+ if !n.leaf {
+ // We will accumulate annotations from each child in the end-inclusive
+ // range [leftChild, rightChild].
+ leftChild, rightChild := leftItem, rightItem
+ // If the lower bound is at or past the smallest key of the item at leftItem,
+ // there is no need to accumulate annotations from the child to its left.
+ if leftItem < int(n.count) && cmp(bounds.Start, n.items[leftItem].Smallest.UserKey) >= 0 {
+ leftChild++
+ }
+ // If the upper bound also covers the largest key of the item at rightItem, include the child to its right.
+ // NOTE(review): rightItem's smallest key is already past bounds.End here, so this looks unreachable — confirm.
+ if rightItem < int(n.count) && bounds.End.IsUpperBoundFor(cmp, n.items[rightItem].Largest.UserKey) {
+ rightChild++
+ }
+
+ for i := leftChild; i <= rightChild; i++ {
+ a.accumulateRangeAnnotation(
+ n.children[i],
+ cmp,
+ bounds,
+ // If this child is to the right of leftItem, then its entire
+ // subtree is within the lower bound.
+ fullyWithinLowerBound || i > leftItem,
+ // If this child is to the left of rightItem, then its entire
+ // subtree is within the upper bound.
+ fullyWithinUpperBound || i < rightItem,
+ )
+ }
+ }
+}
+
 // invalidateNodeAnnotation removes any existing cached annotations from this
 // annotator from a node's subtree.
 func (a *Annotator[T]) invalidateNodeAnnotation(n *node) {
@@ -142,8 +226,8 @@ func (a *Annotator[T]) LevelAnnotation(lm LevelMetadata) *T {
  return v
 }
 
-// LevelAnnotation calculates the annotation defined by this Annotator for all
-// files across the given levels. A pointer to the Annotator is used as the
+// MultiLevelAnnotation calculates the annotation defined by this Annotator for
+// all files across the given levels. A pointer to the Annotator is used as the
 // key for pre-calculated values, so the same Annotator must be used to avoid
 // duplicate computation. Annotation must not be called concurrently, and in
 // practice this is achieved by requiring callers to hold DB.mu.
@@ -158,6 +242,22 @@ func (a *Annotator[T]) MultiLevelAnnotation(lms []LevelMetadata) *T {
  return aggregated
 }
 
+// LevelRangeAnnotation calculates the annotation defined by this Annotator for the
+// files within LevelMetadata which overlap the key range described by bounds. A pointer
+// to the Annotator is used as the key for pre-calculated values, so the same Annotator
+// must be used to avoid duplicate computation. For a non-empty level the result points at
+// a.scratch, which the next call resets. LevelRangeAnnotation must not be called
+// concurrently, and in practice this is achieved by requiring callers to hold DB.mu.
+func (a *Annotator[T]) LevelRangeAnnotation(lm LevelMetadata, bounds base.UserKeyBounds) *T {
+ if lm.Empty() {
+ return a.Aggregator.Zero(nil)
+ }
+
+ a.scratch = a.Aggregator.Zero(a.scratch)
+ a.accumulateRangeAnnotation(lm.tree.root, lm.tree.cmp, bounds, false, false)
+ return a.scratch
+}
+
 // InvalidateAnnotation clears any cached annotations defined by Annotator. A
 // pointer to the Annotator is used as the key for pre-calculated values, so
 // the same Annotator must be used to clear the appropriate cached annotation.
@@ -206,6 +306,14 @@ func SumAnnotator(accumulate func(f *FileMetadata) (v uint64, cacheOK bool)) *An
  }
 }
 
+// NumFilesAnnotator is an Annotator which computes an annotation value
+// equal to the number of files included in the annotation: every file
+// contributes 1 and is reported as cacheable. In particular, it can be used to
+// efficiently count the files in a given key range using range annotations.
+var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
+ return 1, true
+})
+
 // PickFileAggregator implements the AnnotationAggregator interface. It defines
 // an aggregator that picks a single file from a set of eligible files.
 type PickFileAggregator struct {

diff --git a/internal/manifest/annotator_test.go b/internal/manifest/annotator_test.go
@@ -5,54 +5,47 @@
 package manifest
 
 import (
+ "math/rand"
  "testing"
 
  "github.com/cockroachdb/pebble/internal/base"
  "github.com/stretchr/testify/require"
 )
 
-func makeTestLevelMetadata(count int) (LevelMetadata, []*FileMetadata) {
- files := make([]*FileMetadata, count)
- for i := 0; i < count; i++ {
- files[i] = newItem(key(i))
+// makeTestVersion creates a version with numFiles files in level 6.
+func makeTestVersion(numFiles int) (*Version, []*FileMetadata) {
+ files := make([]*FileMetadata, numFiles)
+ for i := 0; i < numFiles; i++ {
+ // Each file spans 10 keys, e.g. [0->9], [10->19], etc.
+ files[i] = (&FileMetadata{}).ExtendPointKeyBounds(
+ base.DefaultComparer.Compare, key(i*10), key(i*10+9),
+ )
+ files[i].InitPhysicalBacking()
  }
 
- lm := MakeLevelMetadata(base.DefaultComparer.Compare, 6, files)
- return lm, files
-}
+ var levelFiles [7][]*FileMetadata
+ levelFiles[6] = files
 
-// NumFilesAnnotator is an Annotator which computes an annotation value
-// equal to the number of files included in the annotation.
-var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
- return 1, true
-})
+ v := NewVersion(base.DefaultComparer, 0, levelFiles)
+ return v, files
+}
 
 func TestNumFilesAnnotator(t *testing.T) {
  const count = 1000
- lm, _ := makeTestLevelMetadata(0)
+ v, _ := makeTestVersion(0)
 
  for i := 1; i <= count; i++ {
- lm.tree.Insert(newItem(key(i)))
- numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
+ v.Levels[6].tree.Insert(newItem(key(i)))
+ numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
  require.EqualValues(t, i, numFiles)
  }
-
- numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
- require.EqualValues(t, count, numFiles)
-
- numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
- require.EqualValues(t, count, numFiles)
-
- lm.tree.Delete(newItem(key(count / 2)))
- numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
- require.EqualValues(t, count-1, numFiles)
 }
 
 func BenchmarkNumFilesAnnotator(b *testing.B) {
- lm, _ := makeTestLevelMetadata(0)
+ v, _ := makeTestVersion(0)
  for i := 1; i <= b.N; i++ {
- lm.tree.Insert(newItem(key(i)))
- numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
+ v.Levels[6].tree.Insert(newItem(key(i)))
+ numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
  require.EqualValues(b, uint64(i), numFiles)
  }
 }
@@ -70,12 +63,115 @@ func TestPickFileAggregator(t *testing.T) {
  },
  }
 
- lm, files := makeTestLevelMetadata(1)
+ v, files := makeTestVersion(1)
 
  for i := 1; i <= count; i++ {
- lm.tree.Insert(newItem(key(i)))
- pickedFile := a.LevelAnnotation(lm)
+ v.Levels[6].tree.Insert(newItem(key(i)))
+ pickedFile := a.LevelAnnotation(v.Levels[6])
  // The picked file should always be the one with the smallest key.
  require.Same(t, files[0], pickedFile)
  }
 }
+
+// bounds builds user-key bounds running from key(i) to key(j); the end key is
+// exclusive if exclusive is set (see base.UserKeyBoundsEndExclusiveIf).
+func bounds(i int, j int, exclusive bool) base.UserKeyBounds {
+ b := base.UserKeyBoundsEndExclusiveIf(key(i).UserKey, key(j).UserKey, exclusive)
+ return b
+}
+
+// randomBounds draws two integers in [0, count) from rng and returns bounds
+// running from the smaller to the larger, with end-exclusivity chosen at random.
+func randomBounds(rng *rand.Rand, count int) base.UserKeyBounds {
+ first := rng.Intn(count)
+ second := rng.Intn(count)
+ exclusive := rng.Intn(2) == 0
+ return bounds(min(first, second), max(first, second), exclusive)
+}
+
+// requireMatchOverlaps asserts that the file count NumFilesAnnotator reports for
+// bounds in level 6 equals the length of the slice returned by v.Overlaps(6, bounds).
+func requireMatchOverlaps(t *testing.T, v *Version, bounds base.UserKeyBounds) {
+ overlaps := v.Overlaps(6, bounds)
+ numFiles := *NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], bounds)
+ require.EqualValues(t, overlaps.length, numFiles)
+}
+
+func TestNumFilesRangeAnnotationEmptyRanges(t *testing.T) {
+ const count = 5_000
+ v, files := makeTestVersion(count)
+
+ // Delete the files covering key ranges [0, 999] and [24_000, 25_999].
+ for i := 0; i < 100; i++ {
+ v.Levels[6].tree.Delete(files[i])
+ }
+ for i := 2400; i < 2600; i++ {
+ v.Levels[6].tree.Delete(files[i])
+ }
+
+ // Ranges that lie entirely within deleted or never-populated key spans.
+ requireMatchOverlaps(t, v, bounds(1, 999, false))
+ requireMatchOverlaps(t, v, bounds(0, 1000, true))
+ requireMatchOverlaps(t, v, bounds(50_000, 60_000, false))
+ requireMatchOverlaps(t, v, bounds(24_500, 25_500, false))
+ requireMatchOverlaps(t, v, bounds(24_000, 26_000, true))
+
+ // Ranges that partially overlap the empty key spans.
+ requireMatchOverlaps(t, v, bounds(0, 1000, false))
+ requireMatchOverlaps(t, v, bounds(20, 1001, true))
+ requireMatchOverlaps(t, v, bounds(20, 1010, true))
+ requireMatchOverlaps(t, v, bounds(23_000, 27_000, true))
+ requireMatchOverlaps(t, v, bounds(25_000, 40_000, false))
+ requireMatchOverlaps(t, v, bounds(25_500, 26_001, true))
+
+ // Ranges which only span a single table.
+ requireMatchOverlaps(t, v, bounds(45_000, 45_000, true))
+ requireMatchOverlaps(t, v, bounds(30_000, 30_001, true))
+ requireMatchOverlaps(t, v, bounds(23_000, 23_000, false))
+}
+
+// TestNumFilesRangeAnnotationRandomized checks NumFilesAnnotator's range
+// annotation against v.Overlaps for numIterations random bounds, drawn from a
+// generator seeded with 0.
+func TestNumFilesRangeAnnotationRandomized(t *testing.T) {
+ const count = 10_000
+ const numIterations = 10_000
+
+ v, _ := makeTestVersion(count)
+
+ rng := rand.New(rand.NewSource(int64(0)))
+ for i := 0; i < numIterations; i++ {
+ requireMatchOverlaps(t, v, randomBounds(rng, count*11))
+ }
+}
+
+func BenchmarkNumFilesRangeAnnotation(b *testing.B) {
+ const count = 100_000
+ v, files := makeTestVersion(count)
+
+ rng := rand.New(rand.NewSource(int64(0)))
+ b.Run("annotator", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ b := randomBounds(rng, count*11)
+ // Delete a random file before the query and reinsert it afterwards, to verify
+ // that range annotations are still fast despite small mutations to the tree.
+ toDelete := rng.Intn(count)
+ v.Levels[6].tree.Delete(files[toDelete])
+
+ NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], b)
+
+ v.Levels[6].tree.Insert(files[toDelete])
+ }
+ })
+
+ // Also benchmark an equivalent aggregation that counts files by iterating over
+ // the result of version.Overlaps, to show the difference in performance.
+ b.Run("overlaps", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ b := randomBounds(rng, count*11)
+ toDelete := rng.Intn(count)
+ v.Levels[6].tree.Delete(files[toDelete])
+
+ overlaps := v.Overlaps(6, b)
+ iter := overlaps.Iter()
+ numFiles := 0
+ for f := iter.First(); f != nil; f = iter.Next() {
+ numFiles++
+ }
+
+ v.Levels[6].tree.Insert(files[toDelete])
+ }
+ })
+
+}