crio: Ensure container state is stopped when calling StopContainer()

CRI-O works well with runc when stopping a container because, as soon
as the container process returns, every container resource, such as
its rootfs, can be considered freed, and CRI-O can proceed further by
unmounting it.

But in the case of virtualized runtimes such as Clear Containers or
Kata Containers, the same rootfs is mounted into the VM, usually as a
hotplugged device. This means the runtime needs to be triggered again
after the container process has returned. In particular, such runtimes
expect a call into "state" in order to realize the container process
is not running anymore, at which point they officially stop the
container and proceed with the necessary unmounts.
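
For illustration, here is a minimal sketch of what one such "state"
round trip looks like from the caller's side. This is not CRI-O code:
isStopped, runtimePath, and containerID are hypothetical names, and
only the standard OCI runtime CLI verb "state" and its state JSON
fields are assumed:

package example

import (
	"encoding/json"
	"os/exec"
)

// isStopped execs the runtime binary's "state" command, decodes the
// OCI state JSON, and reports whether the runtime considers the
// container stopped.
func isStopped(runtimePath, containerID string) (bool, error) {
	out, err := exec.Command(runtimePath, "state", containerID).Output()
	if err != nil {
		return false, err
	}
	// Only the fields needed here; the full OCI state JSON has more.
	var state struct {
		ID     string `json:"id"`
		Status string `json:"status"`
	}
	if err := json.Unmarshal(out, &state); err != nil {
		return false, err
	}
	return state.Status == "stopped", nil
}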

The way this can be done from CRI-O, without impacting the runc case,
is to explicitly wait for the container status to be updated to
"stopped" after the container process has returned. This way, CRI-O
keeps calling into "state" for as long as the container status has not
been properly updated, and generates an error once a timeout is
reached.

Both PollUpdateStatusStopped() and WaitContainerStateStopped() make
use of goroutines in order to support a timeout, following the
waitContainerStop() approach with a chControl channel, as sketched
below.
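
In isolation, that pattern looks roughly like the following sketch. It
is simplified, not the code from this commit: pollOnce is a
hypothetical stand-in for one UpdateStatus()/ContainerStatus() round
trip:

package example

import (
	"fmt"
	"time"
)

// waitStopped polls until pollOnce reports the container as stopped,
// an error occurs, or the timeout elapses. The done channel carries
// the result back; closing chControl tells the goroutine to give up.
func waitStopped(pollOnce func() (bool, error), timeout time.Duration) error {
	done := make(chan error, 1) // buffered so a late result never blocks
	chControl := make(chan struct{})
	go func() {
		for {
			select {
			case <-chControl:
				// The caller timed out and stopped listening.
				return
			default:
				stopped, err := pollOnce()
				if err != nil || stopped {
					done <- err
					return
				}
				time.Sleep(100 * time.Millisecond)
			}
		}
	}()
	select {
	case err := <-done:
		return err
	case <-time.After(timeout):
		close(chControl)
		return fmt.Errorf("timeout of %v reached waiting for stopped state", timeout)
	}
}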

Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
Sebastien Boeuf 2018-02-12 21:59:14 -08:00
parent a5c3e05f9f
commit 1391c5c2fd
4 changed files with 64 additions and 2 deletions


@@ -24,6 +24,9 @@ func (c *ContainerServer) ContainerStop(ctx context.Context, container string, t
	if err := c.runtime.StopContainer(ctx, ctr, timeout); err != nil {
		return "", errors.Wrapf(err, "failed to stop container %s", ctrID)
	}
	if err := c.runtime.WaitContainerStateStopped(ctx, ctr, timeout); err != nil {
		return "", errors.Wrapf(err, "failed to get container 'stopped' status %s", ctrID)
	}
	if err := c.storageRuntimeServer.StopContainer(ctrID); err != nil {
		return "", errors.Wrapf(err, "failed to unmount container %s", ctrID)
	}


@@ -604,6 +604,56 @@ func waitContainerStop(ctx context.Context, c *Container, timeout time.Duration)
	return nil
}

// WaitContainerStateStopped runs a loop polling UpdateStatus(), waiting for
// the container status to be updated to 'stopped'. Either it gets the expected
// status and returns nil, or it reaches the timeout and returns an error.
func (r *Runtime) WaitContainerStateStopped(ctx context.Context, c *Container, timeout int64) (err error) {
	// No need to go further and spawn the goroutine if the container
	// is already in the expected status.
	if r.ContainerStatus(c).Status == ContainerStateStopped {
		return nil
	}
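	// done carries the poll result back to this function; closing chControl
	// tells the polling goroutine to give up once the caller stops waiting.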
	done := make(chan error)
	chControl := make(chan struct{})
	go func() {
		for {
			select {
			case <-chControl:
				return
			default:
				// Check if the container is stopped
				if err := r.UpdateStatus(c); err != nil {
					done <- err
					close(done)
					return
				}
				if r.ContainerStatus(c).Status == ContainerStateStopped {
					close(done)
					return
				}
				time.Sleep(100 * time.Millisecond)
			}
		}
	}()
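	// Block until the polling goroutine reports, the caller's context is
	// cancelled, or the timeout elapses.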
	select {
	case err = <-done:
		break
	case <-ctx.Done():
		close(chControl)
		return ctx.Err()
	case <-time.After(time.Duration(timeout) * time.Second):
		close(chControl)
		return fmt.Errorf("failed to get container stopped status: %ds timeout reached", timeout)
	}
	if err != nil {
		return fmt.Errorf("failed to get container stopped status: %v", err)
	}
	return nil
}

// StopContainer stops a container. Timeout is given in seconds.
func (r *Runtime) StopContainer(ctx context.Context, c *Container, timeout int64) error {
	c.opLock.Lock()
@@ -655,6 +705,7 @@ func (r *Runtime) SetStartFailed(c *Container, err error) {
func (r *Runtime) UpdateStatus(c *Container) error {
	c.opLock.Lock()
	defer c.opLock.Unlock()
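	// Query the runtime's "state" command; its output is the source of
	// truth for the container's current status.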
	out, err := exec.Command(r.Path(c), "state", c.id).Output()
	if err != nil {
		// there are many code paths that could lead to have a bad state in the


@@ -48,10 +48,14 @@ func (s *Server) RemovePodSandbox(ctx context.Context, req *pb.RemovePodSandboxR
	if !sb.Stopped() {
		cState := s.Runtime().ContainerStatus(c)
		if cState.Status == oci.ContainerStateCreated || cState.Status == oci.ContainerStateRunning {
			if err := s.Runtime().StopContainer(ctx, c, 10); err != nil {
			timeout := int64(10)
			if err := s.Runtime().StopContainer(ctx, c, timeout); err != nil {
				// Assume container is already stopped
				logrus.Warnf("failed to stop container %s: %v", c.Name(), err)
			}
			if err := s.Runtime().WaitContainerStateStopped(ctx, c, timeout); err != nil {
				return nil, fmt.Errorf("failed to get container 'stopped' status %s in pod sandbox %s: %v", c.Name(), sb.ID(), err)
			}
		}
	}


@@ -56,9 +56,13 @@ func (s *Server) StopPodSandbox(ctx context.Context, req *pb.StopPodSandboxReque
	for _, c := range containers {
		cStatus := s.Runtime().ContainerStatus(c)
		if cStatus.Status != oci.ContainerStateStopped {
			if err := s.Runtime().StopContainer(ctx, c, 10); err != nil {
			timeout := int64(10)
			if err := s.Runtime().StopContainer(ctx, c, timeout); err != nil {
				return nil, fmt.Errorf("failed to stop container %s in pod sandbox %s: %v", c.Name(), sb.ID(), err)
			}
			if err := s.Runtime().WaitContainerStateStopped(ctx, c, timeout); err != nil {
				return nil, fmt.Errorf("failed to get container 'stopped' status %s in pod sandbox %s: %v", c.Name(), sb.ID(), err)
			}
			if c.ID() == podInfraContainer.ID() {
				continue
			}