Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 70 additions & 42 deletions packages/orchestrator/internal/sandbox/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/trace"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"

"github.com/e2b-dev/infra/packages/orchestrator/internal/cfg"
"github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox/block"
Expand Down Expand Up @@ -742,11 +743,6 @@ func (s *Sandbox) Pause(
}
cleanup.AddNoContext(ctx, snapshotTemplateFiles.Close)

buildID, err := uuid.Parse(snapshotTemplateFiles.BuildID)
if err != nil {
return nil, fmt.Errorf("failed to parse build id: %w", err)
}

// Stop the health check before pausing the VM
s.Checks.Stop()

Expand All @@ -763,50 +759,82 @@ func (s *Sandbox) Pause(
return nil, fmt.Errorf("error creating snapshot: %w", err)
}

// Gather data for postprocessing
originalMemfile, err := s.Template.Memfile(ctx)
buildID, err := uuid.Parse(snapshotTemplateFiles.BuildID)
if err != nil {
return nil, fmt.Errorf("failed to get original memfile: %w", err)
return nil, fmt.Errorf("failed to parse build id: %w", err)
}

originalRootfs, err := s.Template.Rootfs()
if err != nil {
return nil, fmt.Errorf("failed to get original rootfs: %w", err)
}
eg, egCtx := errgroup.WithContext(ctx)

memfileDiffMetadata, err := s.Resources.memory.DiffMetadata(ctx)
if err != nil {
return nil, fmt.Errorf("failed to get memfile metadata: %w", err)
}
var memfileDiff build.Diff
var memfileDiffHeader *header.Header

// Start POSTPROCESSING
memfileDiff, memfileDiffHeader, err := pauseProcessMemory(
ctx,
buildID,
originalMemfile.Header(),
memfileDiffMetadata,
s.config.DefaultCacheDir,
s.process,
)
if err != nil {
return nil, fmt.Errorf("error while post processing: %w", err)
}
cleanup.AddNoContext(ctx, memfileDiff.Close)
eg.Go(func() error {
originalMemfile, err := s.Template.Memfile(egCtx)
if err != nil {
return fmt.Errorf("failed to get original memfile: %w", err)
}

rootfsDiff, rootfsDiffHeader, err := pauseProcessRootfs(
ctx,
buildID,
originalRootfs.Header(),
&RootfsDiffCreator{
rootfs: s.rootfs,
closeHook: s.Close,
},
s.config.DefaultCacheDir,
)
if err != nil {
return nil, fmt.Errorf("error while post processing: %w", err)
diffMetadata, err := s.Resources.memory.DiffMetadata(egCtx)
if err != nil {
return fmt.Errorf("failed to get memfile metadata: %w", err)
}

diff, header, memfileErr := pauseProcessMemory(
egCtx,
buildID,
originalMemfile.Header(),
diffMetadata,
s.config.DefaultCacheDir,
s.process,
)
if memfileErr != nil {
return fmt.Errorf("error while post processing memory: %w", memfileErr)
}

memfileDiff = diff
memfileDiffHeader = header

cleanup.AddNoContext(egCtx, memfileDiff.Close)

return nil
})

var rootfsDiff build.Diff
var rootfsDiffHeader *header.Header

eg.Go(func() error {
originalRootfs, err := s.Template.Rootfs()
if err != nil {
return fmt.Errorf("failed to get original rootfs: %w", err)
}

diff, header, rootfsErr := pauseProcessRootfs(
egCtx,
buildID,
originalRootfs.Header(),
&RootfsDiffCreator{
rootfs: s.rootfs,
closeHook: s.Close,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is running the cleanup in parallel while taking the memory alright?

},
Comment on lines +816 to +819

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Avoid closing sandbox before memory export completes

This rootfs diff runs in parallel with the memfile diff, but RootfsDiffCreator ultimately calls rootfs.ExportDiff, and the NBD implementation spawns a goroutine that invokes closeSandbox immediately (see packages/orchestrator/internal/sandbox/rootfs/nbd.go where closeSandbox(ctx) is called before diff export completes). Because pauseProcessMemory uses s.process.ExportMemory, calling s.Close early can stop the FC process while memory export is still running, leading to intermittent failed to export memory errors or incomplete memfile diffs when the rootfs path finishes first. This only happens with rootfs providers that close the sandbox during export, but those are in use here, so the parallelization introduces a real race.

Useful? React with 👍 / 👎.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will check this

s.config.DefaultCacheDir,
)
if rootfsErr != nil {
return fmt.Errorf("error while post processing rootfs: %w", rootfsErr)
}

rootfsDiff = diff
rootfsDiffHeader = header

cleanup.AddNoContext(egCtx, rootfsDiff.Close)

return nil
})
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Race condition: sandbox cleanup while memory export runs

The parallelization introduces a race condition. The rootfs processing goroutine passes s.Close as the closeHook to ExportDiff, which spawns a goroutine calling s.Close. This runs s.cleanup.Run() including sbx.Stop, which kills the firecracker process via s.process.Stop(). Meanwhile, the memory processing goroutine is concurrently calling s.process.ExportMemory() to read memory from the same process. Previously these ran sequentially, so memory export completed before s.Close was triggered. This is exactly the concern raised in the PR discussion by @dobrac.

Fix in Cursor Fix in Web


waitErr := eg.Wait()
if waitErr != nil {
return nil, fmt.Errorf("error while post processing: %w", waitErr)
}
cleanup.AddNoContext(ctx, rootfsDiff.Close)

metadataFileLink := template.NewLocalFileLink(snapshotTemplateFiles.CacheMetadataPath())
cleanup.AddNoContext(ctx, metadataFileLink.Close)
Expand Down
Loading