Skip to content

Commit

Permalink
manifest: add range annotations
Browse files Browse the repository at this point in the history
This change adds a "range annotation" feature to Annotators, which
are computations that aggregate some value over a specific key range within a level. Range annotations use the same B-tree caching behavior as regular annotations, so queries remain fast even with thousands of tables because they avoid a sequential iteration over a level's files.

This PR only sets up range annotations without changing any existing
behavior. See cockroachdb#3793 for some potential use cases.

`BenchmarkNumFilesRangeAnnotation` shows that range annotations are
significantly faster than using `version.Overlaps` to aggregate over
a key range:
```
pkg: github.com/cockroachdb/pebble/internal/manifest
BenchmarkNumFilesRangeAnnotation/annotator-10         	  306010	      4015 ns/op	      48 B/op	       6 allocs/op
BenchmarkNumFilesRangeAnnotation/overlaps-10          	    2223	    513519 ns/op	     336 B/op	       8 allocs/op
```
  • Loading branch information
anish-shanbhag committed Aug 13, 2024
1 parent 3419a64 commit 4776fa3
Show file tree
Hide file tree
Showing 7 changed files with 286 additions and 79 deletions.
112 changes: 110 additions & 2 deletions internal/manifest/annotator.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

package manifest

import (
"sort"

"github.com/cockroachdb/pebble/internal/base"
)

// The Annotator type defined below is used by other packages to lazily
// compute a value over a B-Tree. Each node of the B-Tree stores one
// `annotation` per annotator, containing the result of the computation over
Expand All @@ -24,6 +30,10 @@ package manifest
// computed incrementally as edits are applied to a level.
type Annotator[T any] struct {
// Aggregator supplies the operations used to build annotation values of
// type T. Within this file its Zero, Accumulate and Merge methods are the
// ones invoked while computing range annotations.
Aggregator AnnotationAggregator[T]

// scratch is used to hold the aggregated annotation value when computing
// range annotations in order to avoid additional allocations. It is
// reassigned on every range annotation computation, and the same pointer is
// what LevelRangeAnnotation returns for a non-empty level, so a previously
// returned result may be overwritten by a later computation.
scratch *T
}

// An AnnotationAggregator defines how an annotation should be accumulated
Expand Down Expand Up @@ -116,6 +126,80 @@ func (a *Annotator[T]) nodeAnnotation(n *node) (_ *T, cacheOK bool) {
return t, annot.valid
}

// accumulateRangeAnnotation folds into a.scratch the annotation of every file
// in n's subtree that overlaps bounds, recursing into child nodes as needed.
//
// fullyWithinLowerBound and fullyWithinUpperBound carry what the caller has
// already established about n's subtree: when a flag is true the subtree is
// known to lie within that side of bounds, so the corresponding binary search
// is skipped. When both flags are true the node's regular annotation (see
// nodeAnnotation) is merged in wholesale.
func (a *Annotator[T]) accumulateRangeAnnotation(
	n *node,
	cmp base.Compare,
	bounds base.UserKeyBounds,
	fullyWithinLowerBound bool,
	fullyWithinUpperBound bool,
) {
	// Fast path: nothing in this subtree can fall outside bounds, so the
	// whole-node annotation can be used as is.
	if fullyWithinLowerBound && fullyWithinUpperBound {
		subtree, _ := a.nodeAnnotation(n)
		a.scratch = a.Aggregator.Merge(subtree, a.scratch)
		return
	}

	numItems := int(n.count)

	// The items overlapping bounds occupy the half-open index range
	// [firstItem, limitItem).
	firstItem := 0
	if !fullyWithinLowerBound {
		// First item whose largest key is at or after the start of bounds.
		firstItem = sort.Search(numItems, func(idx int) bool {
			return cmp(bounds.Start, n.items[idx].Largest.UserKey) <= 0
		})
	}
	limitItem := numItems
	if !fullyWithinUpperBound {
		// First item whose smallest key is past the end of bounds.
		limitItem = sort.Search(numItems, func(idx int) bool {
			return !bounds.End.IsUpperBoundFor(cmp, n.items[idx].Smallest.UserKey)
		})
	}

	for idx := firstItem; idx < limitItem; idx++ {
		a.scratch, _ = a.Aggregator.Accumulate(n.items[idx], a.scratch)
	}

	if n.leaf {
		return
	}

	// Children in the closed index range [firstChild, lastChild] may hold
	// overlapping files.
	firstChild, lastChild := firstItem, limitItem
	// When bounds starts at or after the smallest key of the item at
	// firstItem, the child to that item's left is skipped.
	if firstItem < numItems && cmp(bounds.Start, n.items[firstItem].Smallest.UserKey) >= 0 {
		firstChild++
	}
	// When bounds extends past the largest key of the item at limitItem, the
	// child to that item's right is visited as well.
	if limitItem < numItems && bounds.End.IsUpperBoundFor(cmp, n.items[limitItem].Largest.UserKey) {
		lastChild++
	}

	for c := firstChild; c <= lastChild; c++ {
		// A child to the right of firstItem lies entirely within the lower
		// bound; a child to the left of limitItem lies entirely within the
		// upper bound.
		withinLower := fullyWithinLowerBound || c > firstItem
		withinUpper := fullyWithinUpperBound || c < limitItem
		a.accumulateRangeAnnotation(n.children[c], cmp, bounds, withinLower, withinUpper)
	}
}

// invalidateNodeAnnotation removes any existing cached annotations from this
// annotator from a node's subtree.
func (a *Annotator[T]) invalidateNodeAnnotation(n *node) {
Expand All @@ -142,8 +226,8 @@ func (a *Annotator[T]) LevelAnnotation(lm LevelMetadata) *T {
return v
}

// LevelAnnotation calculates the annotation defined by this Annotator for all
// files across the given levels. A pointer to the Annotator is used as the
// MultiLevelAnnotation calculates the annotation defined by this Annotator for
// all files across the given levels. A pointer to the Annotator is used as the
// key for pre-calculated values, so the same Annotator must be used to avoid
// duplicate computation. Annotation must not be called concurrently, and in
// practice this is achieved by requiring callers to hold DB.mu.
Expand All @@ -158,6 +242,22 @@ func (a *Annotator[T]) MultiLevelAnnotation(lms []LevelMetadata) *T {
return aggregated
}

// LevelRangeAnnotation calculates the annotation defined by this Annotator
// over the files in lm that overlap the key range described by bounds. How
// the end of the range is treated is decided by bounds.End (via its
// IsUpperBoundFor method) rather than being fixed by this function.
//
// For an empty level a zero value obtained from Aggregator.Zero(nil) is
// returned. Otherwise the result is accumulated into, and returned from, the
// Annotator's scratch value, so the returned pointer may be overwritten by
// the next range annotation computed with the same Annotator.
//
// A pointer to the Annotator is used as the key for pre-calculated values, so
// the same Annotator must be used to avoid duplicate computation.
// LevelRangeAnnotation must not be called concurrently, and in practice this
// is achieved by requiring callers to hold DB.mu.
func (a *Annotator[T]) LevelRangeAnnotation(lm LevelMetadata, bounds base.UserKeyBounds) *T {
	if !lm.Empty() {
		a.scratch = a.Aggregator.Zero(a.scratch)
		root, cmp := lm.tree.root, lm.tree.cmp
		a.accumulateRangeAnnotation(root, cmp, bounds, false, false)
		return a.scratch
	}
	return a.Aggregator.Zero(nil)
}

// InvalidateAnnotation clears any cached annotations defined by Annotator. A
// pointer to the Annotator is used as the key for pre-calculated values, so
// the same Annotator must be used to clear the appropriate cached annotation.
Expand Down Expand Up @@ -206,6 +306,14 @@ func SumAnnotator(accumulate func(f *FileMetadata) (v uint64, cacheOK bool)) *An
}
}

// NumFilesAnnotator is an Annotator whose annotation value is the number of
// files covered by the annotation: every file contributes exactly 1 to the
// sum and the contribution is always reported as cacheable. Combined with
// range annotations it gives an efficient count of the files in a key range.
var NumFilesAnnotator = SumAnnotator(func(*FileMetadata) (count uint64, cacheOK bool) {
	return 1, true
})

// PickFileAggregator implements the AnnotationAggregator interface. It defines
// an aggregator that picks a single file from a set of eligible files.
type PickFileAggregator struct {
Expand Down
158 changes: 127 additions & 31 deletions internal/manifest/annotator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,54 +5,47 @@
package manifest

import (
"math/rand"
"testing"

"github.com/cockroachdb/pebble/internal/base"
"github.com/stretchr/testify/require"
)

func makeTestLevelMetadata(count int) (LevelMetadata, []*FileMetadata) {
files := make([]*FileMetadata, count)
for i := 0; i < count; i++ {
files[i] = newItem(key(i))
// Creates a version with numFiles files in level 6.
func makeTestVersion(numFiles int) (*Version, []*FileMetadata) {
files := make([]*FileMetadata, numFiles)
for i := 0; i < numFiles; i++ {
// Each file spans 10 keys, e.g. [0->9], [10->19], etc.
files[i] = (&FileMetadata{}).ExtendPointKeyBounds(
base.DefaultComparer.Compare, key(i*10), key(i*10+9),
)
files[i].InitPhysicalBacking()
}

lm := MakeLevelMetadata(base.DefaultComparer.Compare, 6, files)
return lm, files
}
var levelFiles [7][]*FileMetadata
levelFiles[6] = files

// NumFilesAnnotator is an Annotator which computes an annotation value
// equal to the number of files included in the annotation.
var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
return 1, true
})
v := NewVersion(base.DefaultComparer, 0, levelFiles)
return v, files
}

func TestNumFilesAnnotator(t *testing.T) {
const count = 1000
lm, _ := makeTestLevelMetadata(0)
v, _ := makeTestVersion(0)

for i := 1; i <= count; i++ {
lm.tree.Insert(newItem(key(i)))
numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
v.Levels[6].tree.Insert(newItem(key(i)))
numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
require.EqualValues(t, i, numFiles)
}

numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
require.EqualValues(t, count, numFiles)

numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
require.EqualValues(t, count, numFiles)

lm.tree.Delete(newItem(key(count / 2)))
numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
require.EqualValues(t, count-1, numFiles)
}

func BenchmarkNumFilesAnnotator(b *testing.B) {
lm, _ := makeTestLevelMetadata(0)
v, _ := makeTestVersion(0)
for i := 1; i <= b.N; i++ {
lm.tree.Insert(newItem(key(i)))
numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
v.Levels[6].tree.Insert(newItem(key(i)))
numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
require.EqualValues(b, uint64(i), numFiles)
}
}
Expand All @@ -70,12 +63,115 @@ func TestPickFileAggregator(t *testing.T) {
},
}

lm, files := makeTestLevelMetadata(1)
v, files := makeTestVersion(1)

for i := 1; i <= count; i++ {
lm.tree.Insert(newItem(key(i)))
pickedFile := a.LevelAnnotation(lm)
v.Levels[6].tree.Insert(newItem(key(i)))
pickedFile := a.LevelAnnotation(v.Levels[6])
// The picked file should always be the one with the smallest key.
require.Same(t, files[0], pickedFile)
}
}

// bounds builds user key bounds starting at the user key of key(i) and ending
// at the user key of key(j); the exclusive flag is forwarded to
// base.UserKeyBoundsEndExclusiveIf to select an exclusive end.
func bounds(i int, j int, exclusive bool) base.UserKeyBounds {
	start, end := key(i).UserKey, key(j).UserKey
	return base.UserKeyBoundsEndExclusiveIf(start, end, exclusive)
}

// randomBounds draws two key indices in [0, count) and a coin flip from rng
// (in that order) and returns the bounds running from the smaller index to
// the larger one, with an exclusive end when the coin flip comes up 0.
func randomBounds(rng *rand.Rand, count int) base.UserKeyBounds {
	lo, hi := rng.Intn(count), rng.Intn(count)
	if lo > hi {
		lo, hi = hi, lo
	}
	endExclusive := rng.Intn(2) == 0
	return bounds(lo, hi, endExclusive)
}

// requireMatchOverlaps asserts that the number of files NumFilesAnnotator
// reports for bounds in level 6 of v equals the length of the slice returned
// by v.Overlaps for the same level and bounds.
//
// It is marked as a test helper so failures are attributed to the calling
// line, and the offending bounds are included in the failure message so that
// a failing randomized case can be reproduced directly.
func requireMatchOverlaps(t *testing.T, v *Version, bounds base.UserKeyBounds) {
	t.Helper()
	overlaps := v.Overlaps(6, bounds)
	numFiles := *NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], bounds)
	require.EqualValues(t, overlaps.length, numFiles, "bounds: %v", bounds)
}

// TestNumFilesRangeAnnotationEmptyRanges checks range annotations against
// Version.Overlaps on a level that has had two runs of files removed, using
// bounds that fall inside, straddle, and sit outside the resulting gaps.
func TestNumFilesRangeAnnotationEmptyRanges(t *testing.T) {
	const count = 5_000
	v, files := makeTestVersion(count)

	// Each entry is a half-open range of file indices to delete. With ten
	// keys per file these remove the key ranges [0, 999] and
	// [24_000, 25_999].
	for _, gap := range [][2]int{{0, 100}, {2400, 2600}} {
		for idx := gap[0]; idx < gap[1]; idx++ {
			v.Levels[6].tree.Delete(files[idx])
		}
	}

	check := func(start, end int, exclusive bool) {
		requireMatchOverlaps(t, v, bounds(start, end, exclusive))
	}

	// Ranges that are completely empty.
	check(1, 999, false)
	check(0, 1000, true)
	check(50_000, 60_000, false)
	check(24_500, 25_500, false)
	check(24_000, 26_000, true)

	// Partial overlaps with empty ranges.
	check(0, 1000, false)
	check(20, 1001, true)
	check(20, 1010, true)
	check(23_000, 27_000, true)
	check(25_000, 40_000, false)
	check(25_500, 26_001, true)

	// Ranges which only span a single table.
	check(45_000, 45_000, true)
	check(30_000, 30_001, true)
	check(23_000, 23_000, false)
}

// TestNumFilesRangeAnnotationRandomized compares range annotations against
// Version.Overlaps for a fixed-seed sequence of random bounds. The bounds are
// drawn from a key space slightly larger than the one covered by the files.
func TestNumFilesRangeAnnotationRandomized(t *testing.T) {
	const (
		count         = 10_000
		numIterations = 10_000
	)

	v, _ := makeTestVersion(count)

	rng := rand.New(rand.NewSource(0))
	for remaining := numIterations; remaining > 0; remaining-- {
		requireMatchOverlaps(t, v, randomBounds(rng, count*11))
	}
}

// BenchmarkNumFilesRangeAnnotation measures counting the files in a random
// key range of level 6 in two ways: with NumFilesAnnotator's range annotation
// and by iterating over the result of Version.Overlaps. In both
// sub-benchmarks a random file is deleted before the query and reinserted
// afterwards, so the measurement includes the effect of small mutations.
func BenchmarkNumFilesRangeAnnotation(b *testing.B) {
	const count = 100_000
	v, files := makeTestVersion(count)

	rng := rand.New(rand.NewSource(0))

	b.Run("annotator", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			keyRange := randomBounds(rng, count*11)
			// Randomly delete and reinsert a file to verify that range
			// annotations are still fast despite small mutations.
			victim := files[rng.Intn(count)]
			v.Levels[6].tree.Delete(victim)

			NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], keyRange)

			v.Levels[6].tree.Insert(victim)
		}
	})

	// The same aggregation done through version.Overlaps, as a baseline for
	// the difference in performance.
	b.Run("overlaps", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			keyRange := randomBounds(rng, count*11)
			victim := files[rng.Intn(count)]
			v.Levels[6].tree.Delete(victim)

			overlaps := v.Overlaps(6, keyRange)
			it := overlaps.Iter()
			numFiles := 0
			for f := it.First(); f != nil; f = it.Next() {
				numFiles++
			}

			v.Levels[6].tree.Insert(victim)
		}
	})
}
Loading

0 comments on commit 4776fa3

Please sign in to comment.