From eb935e525ff72decf2ca8998aaa08710c8e7820a Mon Sep 17 00:00:00 2001 From: Erik Sipsma Date: Fri, 14 Jan 2022 13:41:06 -0800 Subject: [PATCH] cache: handle crash after snapshot commit Before this, there could be crash during a call to finalize a ref that occured after the snapshot was committed but before committing the metadata that indicates the immutable ref no longer had an equalMutable. This resulted in a situation where any future calls to finalize that ref would fail. Now, if that situation happens, the cache will notice when it's initially loaded that the ref has an equalMutable that's missing its snapshot and that its own snapshot exists. It will then just use the correctly committed snapshot and clear the equalMutable field. Signed-off-by: Erik Sipsma --- cache/manager.go | 53 +++++++++++++++++++--------- cache/manager_test.go | 81 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 16 deletions(-) diff --git a/cache/manager.go b/cache/manager.go index fe22fc07..36dac6fe 100644 --- a/cache/manager.go +++ b/cache/manager.go @@ -410,25 +410,32 @@ func (cm *cacheManager) getRecord(ctx context.Context, id string, opts ...RefOpt if mutableID := md.getEqualMutable(); mutableID != "" { mutable, err := cm.getRecord(ctx, mutableID) - if err != nil { - // check loading mutable deleted record from disk - if IsNotFound(err) { - cm.MetadataStore.Clear(id) + if err == nil { + rec := &cacheRecord{ + mu: &sync.Mutex{}, + cm: cm, + refs: make(map[ref]struct{}), + parentRefs: parents, + cacheMetadata: md, + equalMutable: &mutableRef{cacheRecord: mutable}, } + mutable.equalImmutable = &immutableRef{cacheRecord: rec} + cm.records[id] = rec + return rec, nil + } else if IsNotFound(err) { + // The equal mutable for this ref is not found, check to see if our snapshot exists + if _, statErr := cm.Snapshotter.Stat(ctx, md.getSnapshotID()); statErr != nil { + // this ref's snapshot also doesn't exist, just remove this record + cm.MetadataStore.Clear(id) + return nil, errors.Wrap(errNotFound, id) + } + // Our snapshot exists, so there may have been a crash while finalizing this ref. + // Clear the equal mutable field and continue using this ref. + md.clearEqualMutable() + md.commitMetadata() + } else { return nil, err } - - rec := &cacheRecord{ - mu: &sync.Mutex{}, - cm: cm, - refs: make(map[ref]struct{}), - parentRefs: parents, - cacheMetadata: md, - equalMutable: &mutableRef{cacheRecord: mutable}, - } - mutable.equalImmutable = &immutableRef{cacheRecord: rec} - cm.records[id] = rec - return rec, nil } rec := &cacheRecord{ @@ -448,6 +455,20 @@ func (cm *cacheManager) getRecord(ctx context.Context, id string, opts ...RefOpt return nil, errors.Wrapf(errNotFound, "failed to get deleted record %s", id) } + if rec.mutable { + // If the record is mutable, then the snapshot must exist + if _, err := cm.Snapshotter.Stat(ctx, rec.ID()); err != nil { + if !errdefs.IsNotFound(err) { + return nil, errors.Wrap(err, "failed to check mutable ref snapshot") + } + // the snapshot doesn't exist, clear this record + if err := rec.remove(ctx, true); err != nil { + return nil, errors.Wrap(err, "failed to remove mutable rec with missing snapshot") + } + return nil, errors.Wrap(errNotFound, rec.ID()) + } + } + if err := initializeMetadata(rec.cacheMetadata, rec.parentRefs, opts...); err != nil { return nil, err } diff --git a/cache/manager_test.go b/cache/manager_test.go index d5cd5d14..80d465b2 100644 --- a/cache/manager_test.go +++ b/cache/manager_test.go @@ -1684,6 +1684,87 @@ func TestMergeOp(t *testing.T) { checkDiskUsage(ctx, t, cm, 0, 0) } +func TestLoadHalfFinalizedRef(t *testing.T) { + // This test simulates the situation where a ref w/ an equalMutable has its + // snapshot committed but there is a crash before the metadata is updated to + // clear the equalMutable field. It's expected that the mutable will be + // removed and the immutable ref will continue to be usable. + t.Parallel() + + ctx := namespaces.WithNamespace(context.Background(), "buildkit-test") + + tmpdir, err := ioutil.TempDir("", "cachemanager") + require.NoError(t, err) + defer os.RemoveAll(tmpdir) + + snapshotter, err := native.NewSnapshotter(filepath.Join(tmpdir, "snapshots")) + require.NoError(t, err) + + co, cleanup, err := newCacheManager(ctx, cmOpt{ + tmpdir: tmpdir, + snapshotter: snapshotter, + snapshotterName: "native", + }) + require.NoError(t, err) + defer cleanup() + cm := co.manager.(*cacheManager) + + mref, err := cm.New(ctx, nil, nil, CachePolicyRetain) + require.NoError(t, err) + mutRef := mref.(*mutableRef) + + iref, err := mutRef.Commit(ctx) + require.NoError(t, err) + immutRef := iref.(*immutableRef) + + require.NoError(t, mref.Release(ctx)) + + _, err = co.lm.Create(ctx, func(l *leases.Lease) error { + l.ID = immutRef.ID() + l.Labels = map[string]string{ + "containerd.io/gc.flat": time.Now().UTC().Format(time.RFC3339Nano), + } + return nil + }) + require.NoError(t, err) + err = co.lm.AddResource(ctx, leases.Lease{ID: immutRef.ID()}, leases.Resource{ + ID: immutRef.getSnapshotID(), + Type: "snapshots/" + cm.Snapshotter.Name(), + }) + require.NoError(t, err) + + err = cm.Snapshotter.Commit(ctx, immutRef.getSnapshotID(), mutRef.getSnapshotID()) + require.NoError(t, err) + + _, err = cm.Snapshotter.Stat(ctx, mutRef.getSnapshotID()) + require.Error(t, err) + + require.NoError(t, iref.Release(ctx)) + + require.NoError(t, cm.Close()) + require.NoError(t, cleanup()) + + co, cleanup, err = newCacheManager(ctx, cmOpt{ + tmpdir: tmpdir, + snapshotter: snapshotter, + snapshotterName: "native", + }) + require.NoError(t, err) + defer cleanup() + cm = co.manager.(*cacheManager) + + _, err = cm.GetMutable(ctx, mutRef.ID()) + require.ErrorIs(t, err, errNotFound) + + iref, err = cm.Get(ctx, immutRef.ID()) + require.NoError(t, err) + require.NoError(t, iref.Finalize(ctx)) + immutRef = iref.(*immutableRef) + + _, err = cm.Snapshotter.Stat(ctx, immutRef.getSnapshotID()) + require.NoError(t, err) +} + func TestMountReadOnly(t *testing.T) { t.Parallel() if runtime.GOOS != "linux" {