Merge pull request #2564 from sipsma/fix-2557

cache: handle crash after snapshot commit
master
Tõnis Tiigi 2022-01-19 00:41:04 -08:00 committed by GitHub
commit e47e047569
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 118 additions and 16 deletions

53
cache/manager.go vendored
View File

@ -410,25 +410,32 @@ func (cm *cacheManager) getRecord(ctx context.Context, id string, opts ...RefOpt
if mutableID := md.getEqualMutable(); mutableID != "" {
mutable, err := cm.getRecord(ctx, mutableID)
if err != nil {
// check loading mutable deleted record from disk
if IsNotFound(err) {
cm.MetadataStore.Clear(id)
if err == nil {
rec := &cacheRecord{
mu: &sync.Mutex{},
cm: cm,
refs: make(map[ref]struct{}),
parentRefs: parents,
cacheMetadata: md,
equalMutable: &mutableRef{cacheRecord: mutable},
}
mutable.equalImmutable = &immutableRef{cacheRecord: rec}
cm.records[id] = rec
return rec, nil
} else if IsNotFound(err) {
// The equal mutable for this ref is not found, check to see if our snapshot exists
if _, statErr := cm.Snapshotter.Stat(ctx, md.getSnapshotID()); statErr != nil {
// this ref's snapshot also doesn't exist, just remove this record
cm.MetadataStore.Clear(id)
return nil, errors.Wrap(errNotFound, id)
}
// Our snapshot exists, so there may have been a crash while finalizing this ref.
// Clear the equal mutable field and continue using this ref.
md.clearEqualMutable()
md.commitMetadata()
} else {
return nil, err
}
rec := &cacheRecord{
mu: &sync.Mutex{},
cm: cm,
refs: make(map[ref]struct{}),
parentRefs: parents,
cacheMetadata: md,
equalMutable: &mutableRef{cacheRecord: mutable},
}
mutable.equalImmutable = &immutableRef{cacheRecord: rec}
cm.records[id] = rec
return rec, nil
}
rec := &cacheRecord{
@ -448,6 +455,20 @@ func (cm *cacheManager) getRecord(ctx context.Context, id string, opts ...RefOpt
return nil, errors.Wrapf(errNotFound, "failed to get deleted record %s", id)
}
if rec.mutable {
// If the record is mutable, then the snapshot must exist
if _, err := cm.Snapshotter.Stat(ctx, rec.ID()); err != nil {
if !errdefs.IsNotFound(err) {
return nil, errors.Wrap(err, "failed to check mutable ref snapshot")
}
// the snapshot doesn't exist, clear this record
if err := rec.remove(ctx, true); err != nil {
return nil, errors.Wrap(err, "failed to remove mutable rec with missing snapshot")
}
return nil, errors.Wrap(errNotFound, rec.ID())
}
}
if err := initializeMetadata(rec.cacheMetadata, rec.parentRefs, opts...); err != nil {
return nil, err
}

81
cache/manager_test.go vendored
View File

@ -1684,6 +1684,87 @@ func TestMergeOp(t *testing.T) {
checkDiskUsage(ctx, t, cm, 0, 0)
}
func TestLoadHalfFinalizedRef(t *testing.T) {
// This test simulates the situation where a ref w/ an equalMutable has its
// snapshot committed but there is a crash before the metadata is updated to
// clear the equalMutable field. It's expected that the mutable will be
// removed and the immutable ref will continue to be usable.
t.Parallel()
ctx := namespaces.WithNamespace(context.Background(), "buildkit-test")
tmpdir, err := ioutil.TempDir("", "cachemanager")
require.NoError(t, err)
defer os.RemoveAll(tmpdir)
snapshotter, err := native.NewSnapshotter(filepath.Join(tmpdir, "snapshots"))
require.NoError(t, err)
co, cleanup, err := newCacheManager(ctx, cmOpt{
tmpdir: tmpdir,
snapshotter: snapshotter,
snapshotterName: "native",
})
require.NoError(t, err)
defer cleanup()
cm := co.manager.(*cacheManager)
mref, err := cm.New(ctx, nil, nil, CachePolicyRetain)
require.NoError(t, err)
mutRef := mref.(*mutableRef)
iref, err := mutRef.Commit(ctx)
require.NoError(t, err)
immutRef := iref.(*immutableRef)
require.NoError(t, mref.Release(ctx))
_, err = co.lm.Create(ctx, func(l *leases.Lease) error {
l.ID = immutRef.ID()
l.Labels = map[string]string{
"containerd.io/gc.flat": time.Now().UTC().Format(time.RFC3339Nano),
}
return nil
})
require.NoError(t, err)
err = co.lm.AddResource(ctx, leases.Lease{ID: immutRef.ID()}, leases.Resource{
ID: immutRef.getSnapshotID(),
Type: "snapshots/" + cm.Snapshotter.Name(),
})
require.NoError(t, err)
err = cm.Snapshotter.Commit(ctx, immutRef.getSnapshotID(), mutRef.getSnapshotID())
require.NoError(t, err)
_, err = cm.Snapshotter.Stat(ctx, mutRef.getSnapshotID())
require.Error(t, err)
require.NoError(t, iref.Release(ctx))
require.NoError(t, cm.Close())
require.NoError(t, cleanup())
co, cleanup, err = newCacheManager(ctx, cmOpt{
tmpdir: tmpdir,
snapshotter: snapshotter,
snapshotterName: "native",
})
require.NoError(t, err)
defer cleanup()
cm = co.manager.(*cacheManager)
_, err = cm.GetMutable(ctx, mutRef.ID())
require.ErrorIs(t, err, errNotFound)
iref, err = cm.Get(ctx, immutRef.ID())
require.NoError(t, err)
require.NoError(t, iref.Finalize(ctx))
immutRef = iref.(*immutableRef)
_, err = cm.Snapshotter.Stat(ctx, immutRef.getSnapshotID())
require.NoError(t, err)
}
func TestMountReadOnly(t *testing.T) {
t.Parallel()
if runtime.GOOS != "linux" {