Skip to content

Commit

Permalink
bboltcachestorage: mitigate corrupt boltdb cache after panic
Browse files Browse the repository at this point in the history
There are reports that running boltdb with the nosync configuration can
leave the database corrupted, causing panics when it is reopened on restart.
Mitigate this by recovering from the panic, backing up the corrupt file, and
resetting to an empty database until there is a better solution.

Co-authored-by: Tonis Tiigi <[email protected]>
Signed-off-by: Jonathan A. Sternberg <[email protected]>
  • Loading branch information
tonistiigi authored and jsternberg committed May 31, 2024
1 parent 1c55173 commit 119abaa
Showing 1 changed file with 54 additions and 2 deletions.
56 changes: 54 additions & 2 deletions solver/bboltcachestorage/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ import (
"bytes"
"encoding/json"
"fmt"
"os"

"github.com/moby/buildkit/identity"
"github.com/moby/buildkit/solver"
"github.com/moby/buildkit/util/bklog"
digest "github.com/opencontainers/go-digest"
"github.com/pkg/errors"
bolt "go.etcd.io/bbolt"
Expand All @@ -23,10 +26,12 @@ type Store struct {
}

func NewStore(dbPath string) (*Store, error) {
db, err := bolt.Open(dbPath, 0600, nil)
db, err := safeOpenDB(dbPath)
if err != nil {
return nil, errors.Wrapf(err, "failed to open database file %s", dbPath)
return nil, err
}

// Initialize the database with the needed buckets if they do not exist.
if err := db.Update(func(tx *bolt.Tx) error {
for _, b := range []string{resultBucket, linksBucket, byResultBucket, backlinksBucket} {
if _, err := tx.CreateBucketIfNotExists([]byte(b)); err != nil {
Expand Down Expand Up @@ -455,3 +460,50 @@ func isEmptyBucket(b *bolt.Bucket) bool {
k, _ := b.Cursor().First()
return k == nil
}

// safeOpenDB opens the bolt database at dbPath, converting a panic raised
// during the open (as a corrupted database file can cause) into an error.
//
// Whenever the open fails, by error or by panic, and dbPath still passes
// canAccessFile, the result is replaced by whatever fallbackOpenDB returns
// for that path and the original failure.
func safeOpenDB(dbPath string) (db *bolt.DB, err error) {
	defer func() {
		// recover must be called directly in this deferred function;
		// a recovered panic value is turned into the returned error.
		if p := recover(); p != nil {
			err = errors.Errorf("%v", p)
		}

		// Nothing more to do on success, or when the file cannot be
		// accessed at all: the error (if any) is returned unchanged.
		if err == nil || !canAccessFile(dbPath) {
			return
		}

		// The file is there but could not be opened, so it may be
		// corrupt: fall back to resetting the database.
		db, err = fallbackOpenDB(dbPath, err)
	}()

	db, err = openDB(dbPath)
	return db, err
}

// fallbackOpenDB replaces a database file that could not be opened with a
// fresh one. safeOpenDB calls it after the initial open attempt returned an
// error or panicked; the panic recovery itself happens in safeOpenDB.
//
// openErr is the failure from the first open attempt. It is logged together
// with dbPath and the backup path. The existing file at dbPath is then
// renamed to "<dbPath>.<id>.bak", where <id> comes from identity.NewID, and
// dbPath is opened again, which should produce a new database.
//
// An error is returned if the rename fails or if the second open fails.
// No recover is installed around the second open.
func fallbackOpenDB(dbPath string, openErr error) (*bolt.DB, error) {
	backupPath := dbPath + "." + identity.NewID() + ".bak"
	bklog.L.Errorf("failed to open database file %s, resetting to empty. Old database is backed up to %s. "+
		"This error signifies that buildkitd likely crashed or was sigkilled abruptly, leaving the database corrupted. "+
		"If you see logs from a previous panic then please report in the issue tracker at https://github.com/moby/buildkit . %+v", dbPath, backupPath, openErr)
	if err := os.Rename(dbPath, backupPath); err != nil {
		return nil, errors.Wrapf(err, "failed to rename database file %s to %s", dbPath, backupPath)
	}

	// Attempt to open the database again. This should be a new database.
	// If this fails, it is a permanent error.
	return openDB(dbPath)
}

// openDB opens the bolt database at dbPath with file mode 0600
// (read/write for the owning user only) and a nil options argument.
func openDB(dbPath string) (*bolt.DB, error) {
	const ownerReadWrite = 0600

	// No explicit options are supplied to bolt.Open.
	var opts *bolt.Options

	return bolt.Open(dbPath, ownerReadWrite, opts)
}

// canAccessFile reports whether os.Stat succeeds for dbPath, i.e. whether
// the path exists and its metadata can be read by this process. It does
// not open the file, so it does not verify read or write permission on the
// file's contents.
func canAccessFile(dbPath string) bool {
	if _, statErr := os.Stat(dbPath); statErr != nil {
		return false
	}
	return true
}

0 comments on commit 119abaa

Please sign in to comment.