cache: handle crash after snapshot commit

Before this, there could be crash during a call to finalize a ref that
occured after the snapshot was committed but before committing the
metadata that indicates the immutable ref no longer had an
equalMutable. This resulted in a situation where any future calls to
finalize that ref would fail.

Now, if that situation happens, the cache will notice when it's
initially loaded that the ref has an equalMutable that's missing its
snapshot and that its own snapshot exists. It will then just use the
correctly committed snapshot and clear the equalMutable field.

Signed-off-by: Erik Sipsma <erik@sipsma.dev>
master
Erik Sipsma 2022-01-14 13:41:06 -08:00
parent 17c237d69a
commit eb935e525f
2 changed files with 118 additions and 16 deletions

53
cache/manager.go vendored
View File

@ -410,25 +410,32 @@ func (cm *cacheManager) getRecord(ctx context.Context, id string, opts ...RefOpt
if mutableID := md.getEqualMutable(); mutableID != "" {
mutable, err := cm.getRecord(ctx, mutableID)
if err != nil {
// check loading mutable deleted record from disk
if IsNotFound(err) {
cm.MetadataStore.Clear(id)
if err == nil {
rec := &cacheRecord{
mu: &sync.Mutex{},
cm: cm,
refs: make(map[ref]struct{}),
parentRefs: parents,
cacheMetadata: md,
equalMutable: &mutableRef{cacheRecord: mutable},
}
mutable.equalImmutable = &immutableRef{cacheRecord: rec}
cm.records[id] = rec
return rec, nil
} else if IsNotFound(err) {
// The equal mutable for this ref is not found, check to see if our snapshot exists
if _, statErr := cm.Snapshotter.Stat(ctx, md.getSnapshotID()); statErr != nil {
// this ref's snapshot also doesn't exist, just remove this record
cm.MetadataStore.Clear(id)
return nil, errors.Wrap(errNotFound, id)
}
// Our snapshot exists, so there may have been a crash while finalizing this ref.
// Clear the equal mutable field and continue using this ref.
md.clearEqualMutable()
md.commitMetadata()
} else {
return nil, err
}
rec := &cacheRecord{
mu: &sync.Mutex{},
cm: cm,
refs: make(map[ref]struct{}),
parentRefs: parents,
cacheMetadata: md,
equalMutable: &mutableRef{cacheRecord: mutable},
}
mutable.equalImmutable = &immutableRef{cacheRecord: rec}
cm.records[id] = rec
return rec, nil
}
rec := &cacheRecord{
@ -448,6 +455,20 @@ func (cm *cacheManager) getRecord(ctx context.Context, id string, opts ...RefOpt
return nil, errors.Wrapf(errNotFound, "failed to get deleted record %s", id)
}
if rec.mutable {
// If the record is mutable, then the snapshot must exist
if _, err := cm.Snapshotter.Stat(ctx, rec.ID()); err != nil {
if !errdefs.IsNotFound(err) {
return nil, errors.Wrap(err, "failed to check mutable ref snapshot")
}
// the snapshot doesn't exist, clear this record
if err := rec.remove(ctx, true); err != nil {
return nil, errors.Wrap(err, "failed to remove mutable rec with missing snapshot")
}
return nil, errors.Wrap(errNotFound, rec.ID())
}
}
if err := initializeMetadata(rec.cacheMetadata, rec.parentRefs, opts...); err != nil {
return nil, err
}

81
cache/manager_test.go vendored
View File

@ -1684,6 +1684,87 @@ func TestMergeOp(t *testing.T) {
checkDiskUsage(ctx, t, cm, 0, 0)
}
func TestLoadHalfFinalizedRef(t *testing.T) {
// This test simulates the situation where a ref w/ an equalMutable has its
// snapshot committed but there is a crash before the metadata is updated to
// clear the equalMutable field. It's expected that the mutable will be
// removed and the immutable ref will continue to be usable.
t.Parallel()
ctx := namespaces.WithNamespace(context.Background(), "buildkit-test")
tmpdir, err := ioutil.TempDir("", "cachemanager")
require.NoError(t, err)
defer os.RemoveAll(tmpdir)
snapshotter, err := native.NewSnapshotter(filepath.Join(tmpdir, "snapshots"))
require.NoError(t, err)
co, cleanup, err := newCacheManager(ctx, cmOpt{
tmpdir: tmpdir,
snapshotter: snapshotter,
snapshotterName: "native",
})
require.NoError(t, err)
defer cleanup()
cm := co.manager.(*cacheManager)
mref, err := cm.New(ctx, nil, nil, CachePolicyRetain)
require.NoError(t, err)
mutRef := mref.(*mutableRef)
iref, err := mutRef.Commit(ctx)
require.NoError(t, err)
immutRef := iref.(*immutableRef)
require.NoError(t, mref.Release(ctx))
_, err = co.lm.Create(ctx, func(l *leases.Lease) error {
l.ID = immutRef.ID()
l.Labels = map[string]string{
"containerd.io/gc.flat": time.Now().UTC().Format(time.RFC3339Nano),
}
return nil
})
require.NoError(t, err)
err = co.lm.AddResource(ctx, leases.Lease{ID: immutRef.ID()}, leases.Resource{
ID: immutRef.getSnapshotID(),
Type: "snapshots/" + cm.Snapshotter.Name(),
})
require.NoError(t, err)
err = cm.Snapshotter.Commit(ctx, immutRef.getSnapshotID(), mutRef.getSnapshotID())
require.NoError(t, err)
_, err = cm.Snapshotter.Stat(ctx, mutRef.getSnapshotID())
require.Error(t, err)
require.NoError(t, iref.Release(ctx))
require.NoError(t, cm.Close())
require.NoError(t, cleanup())
co, cleanup, err = newCacheManager(ctx, cmOpt{
tmpdir: tmpdir,
snapshotter: snapshotter,
snapshotterName: "native",
})
require.NoError(t, err)
defer cleanup()
cm = co.manager.(*cacheManager)
_, err = cm.GetMutable(ctx, mutRef.ID())
require.ErrorIs(t, err, errNotFound)
iref, err = cm.Get(ctx, immutRef.ID())
require.NoError(t, err)
require.NoError(t, iref.Finalize(ctx))
immutRef = iref.(*immutableRef)
_, err = cm.Snapshotter.Stat(ctx, immutRef.getSnapshotID())
require.NoError(t, err)
}
func TestMountReadOnly(t *testing.T) {
t.Parallel()
if runtime.GOOS != "linux" {