52baf68d50
Signed-off-by: Michał Żyłowski <michal.zylowski@intel.com>
327 lines
11 KiB
Go
327 lines
11 KiB
Go
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
|
|
|
|
package statefulset
|
|
|
|
import (
|
|
"fmt"
|
|
"strconv"
|
|
|
|
"k8s.io/apimachinery/pkg/api/errors"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/runtime"
|
|
"k8s.io/kubernetes/pkg/api/v1"
|
|
apps "k8s.io/kubernetes/pkg/apis/apps/v1beta1"
|
|
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
|
|
"k8s.io/kubernetes/pkg/client/record"
|
|
|
|
"github.com/golang/glog"
|
|
)
|
|
|
|
// petLifeCycleEvent names a high level action the controller needs to take on
// a given pet. It is carried in the event field of a pcb. The recognized
// values are the constants declared directly below.
type petLifeCycleEvent string

// Recognized lifecycle events.
const (
	// syncPet is the lifecycle event whose value is "sync".
	syncPet petLifeCycleEvent = "sync"
	// deletePet is the lifecycle event whose value is "delete".
	deletePet petLifeCycleEvent = "delete"
)

const (
	// updateRetries is the number of Get/Update cycles we perform when an
	// update fails.
	updateRetries = 3

	// StatefulSetInitAnnotation is an annotation which, when set, indicates
	// that the pet has finished initializing itself.
	// TODO: Replace this with init container status.
	StatefulSetInitAnnotation = "pod.alpha.kubernetes.io/initialized"
)
|
|
|
|
// pcb is the control block used to transmit all updates about a single pet.
|
|
// It serves as the manifest for a single pet. Users must populate the pod
|
|
// and parent fields to pass it around safely.
|
|
type pcb struct {
|
|
// pod is the desired pet pod.
|
|
pod *v1.Pod
|
|
// pvcs is a list of desired persistent volume claims for the pet pod.
|
|
pvcs []v1.PersistentVolumeClaim
|
|
// event is the lifecycle event associated with this update.
|
|
event petLifeCycleEvent
|
|
// id is the identity index of this pet.
|
|
id string
|
|
// parent is a pointer to the parent statefulset.
|
|
parent *apps.StatefulSet
|
|
}
|
|
|
|
// pvcClient is a client for managing persistent volume claims.
|
|
type pvcClient interface {
|
|
// DeletePVCs deletes the pvcs in the given pcb.
|
|
DeletePVCs(*pcb) error
|
|
// SyncPVCs creates/updates pvcs in the given pcb.
|
|
SyncPVCs(*pcb) error
|
|
}
|
|
|
|
// petSyncer syncs a single pet.
|
|
type petSyncer struct {
|
|
petClient
|
|
|
|
// blockingPet is an unhealthy pet either from this iteration or a previous
|
|
// iteration, either because it is not yet Running, or being Deleted, that
|
|
// prevents other creates/deletions.
|
|
blockingPet *pcb
|
|
}
|
|
|
|
// errUnhealthyPet is returned when we either know for sure a pet is
// unhealthy, or don't know its state but assume it is unhealthy. It is a
// signal to the caller for further operations such as updating
// status.replicas. This is not a fatal error.
type errUnhealthyPet string

// Error implements the error interface; the message is the string value
// itself.
func (u errUnhealthyPet) Error() string {
	msg := string(u)
	return msg
}
|
|
|
|
// Sync syncs the given pet.
|
|
func (p *petSyncer) Sync(pet *pcb) error {
|
|
if pet == nil {
|
|
return nil
|
|
}
|
|
realPet, exists, err := p.Get(pet)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// There is not constraint except quota on the number of pvcs created.
|
|
// This is done per pet so we get a working cluster ASAP, even if user
|
|
// runs out of quota.
|
|
if err := p.SyncPVCs(pet); err != nil {
|
|
return err
|
|
}
|
|
// if pet failed - we need to remove old one because of consistent naming
|
|
if exists && realPet.pod.Status.Phase == v1.PodFailed {
|
|
glog.V(2).Infof("Deleting evicted pod %v/%v", realPet.pod.Namespace, realPet.pod.Name)
|
|
if err := p.petClient.Delete(realPet); err != nil {
|
|
return err
|
|
}
|
|
} else if exists {
|
|
if !p.isHealthy(realPet.pod) {
|
|
glog.V(4).Infof("StatefulSet %v waiting on unhealthy pod %v", pet.parent.Name, realPet.pod.Name)
|
|
}
|
|
return p.Update(realPet, pet)
|
|
}
|
|
if p.blockingPet != nil {
|
|
message := errUnhealthyPet(fmt.Sprintf("Create of %v in StatefulSet %v blocked by unhealthy pod %v", pet.pod.Name, pet.parent.Name, p.blockingPet.pod.Name))
|
|
glog.V(4).Infof(message.Error())
|
|
return message
|
|
}
|
|
// This is counted as a create, even if it fails. We can't skip indices
|
|
// because some pets might allocate a special role to earlier indices.
|
|
// The returned error will force a requeue.
|
|
// TODO: What's the desired behavior if pet-0 is deleted while pet-1 is
|
|
// not yet healthy? currently pet-0 will wait till pet-1 is healthy,
|
|
// this feels safer, but might lead to deadlock.
|
|
p.blockingPet = pet
|
|
if err := p.Create(pet); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Delete deletes the given pet, if no other pet in the statefulset is blocking a
|
|
// scale event.
|
|
func (p *petSyncer) Delete(pet *pcb) error {
|
|
if pet == nil {
|
|
return nil
|
|
}
|
|
realPet, exists, err := p.Get(pet)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !exists {
|
|
return nil
|
|
}
|
|
if p.blockingPet != nil {
|
|
glog.V(4).Infof("Delete of %v in StatefulSet %v blocked by unhealthy pod %v", realPet.pod.Name, pet.parent.Name, p.blockingPet.pod.Name)
|
|
return nil
|
|
}
|
|
// This is counted as a delete, even if it fails.
|
|
// The returned error will force a requeue.
|
|
p.blockingPet = realPet
|
|
if !p.isDying(realPet.pod) {
|
|
glog.V(2).Infof("StatefulSet %v deleting pod %v/%v", pet.parent.Name, pet.pod.Namespace, pet.pod.Name)
|
|
return p.petClient.Delete(pet)
|
|
}
|
|
glog.V(4).Infof("StatefulSet %v waiting on pod %v to die in %v", pet.parent.Name, realPet.pod.Name, realPet.pod.DeletionTimestamp)
|
|
return nil
|
|
}
|
|
|
|
// petClient is a client for managing pets.
|
|
type petClient interface {
|
|
pvcClient
|
|
petHealthChecker
|
|
Delete(*pcb) error
|
|
Get(*pcb) (*pcb, bool, error)
|
|
Create(*pcb) error
|
|
Update(*pcb, *pcb) error
|
|
}
|
|
|
|
// apiServerPetClient is a statefulset aware Kubernetes client.
|
|
type apiServerPetClient struct {
|
|
c clientset.Interface
|
|
recorder record.EventRecorder
|
|
petHealthChecker
|
|
}
|
|
|
|
// Get gets the pet in the pcb from the apiserver.
|
|
func (p *apiServerPetClient) Get(pet *pcb) (*pcb, bool, error) {
|
|
ns := pet.parent.Namespace
|
|
pod, err := p.c.Core().Pods(ns).Get(pet.pod.Name, metav1.GetOptions{})
|
|
if errors.IsNotFound(err) {
|
|
return nil, false, nil
|
|
}
|
|
if err != nil {
|
|
return nil, false, err
|
|
}
|
|
realPet := *pet
|
|
realPet.pod = pod
|
|
return &realPet, true, nil
|
|
}
|
|
|
|
// Delete deletes the pet in the pcb from the apiserver.
|
|
func (p *apiServerPetClient) Delete(pet *pcb) error {
|
|
err := p.c.Core().Pods(pet.parent.Namespace).Delete(pet.pod.Name, nil)
|
|
if errors.IsNotFound(err) {
|
|
err = nil
|
|
}
|
|
p.event(pet.parent, "Delete", fmt.Sprintf("pod: %v", pet.pod.Name), err)
|
|
return err
|
|
}
|
|
|
|
// Create creates the pet in the pcb.
|
|
func (p *apiServerPetClient) Create(pet *pcb) error {
|
|
_, err := p.c.Core().Pods(pet.parent.Namespace).Create(pet.pod)
|
|
p.event(pet.parent, "Create", fmt.Sprintf("pod: %v", pet.pod.Name), err)
|
|
return err
|
|
}
|
|
|
|
// Update updates the pet in the 'pet' pcb to match the pet in the 'expectedPet' pcb.
|
|
// If the pod object of a pet which to be updated has been changed in server side, we
|
|
// will get the actual value and set pet identity before retries.
|
|
func (p *apiServerPetClient) Update(pet *pcb, expectedPet *pcb) (updateErr error) {
|
|
pc := p.c.Core().Pods(pet.parent.Namespace)
|
|
|
|
for i := 0; ; i++ {
|
|
updatePod, needsUpdate, err := copyPetID(pet, expectedPet)
|
|
if err != nil || !needsUpdate {
|
|
return err
|
|
}
|
|
glog.V(4).Infof("Resetting pod %v/%v to match StatefulSet %v spec", pet.pod.Namespace, pet.pod.Name, pet.parent.Name)
|
|
_, updateErr = pc.Update(&updatePod)
|
|
if updateErr == nil || i >= updateRetries {
|
|
return updateErr
|
|
}
|
|
getPod, getErr := pc.Get(updatePod.Name, metav1.GetOptions{})
|
|
if getErr != nil {
|
|
return getErr
|
|
}
|
|
pet.pod = getPod
|
|
}
|
|
}
|
|
|
|
// DeletePVCs should delete PVCs, when implemented.
|
|
func (p *apiServerPetClient) DeletePVCs(pet *pcb) error {
|
|
// TODO: Implement this when we delete pvcs.
|
|
return nil
|
|
}
|
|
|
|
func (p *apiServerPetClient) getPVC(pvcName, pvcNamespace string) (*v1.PersistentVolumeClaim, error) {
|
|
pvc, err := p.c.Core().PersistentVolumeClaims(pvcNamespace).Get(pvcName, metav1.GetOptions{})
|
|
return pvc, err
|
|
}
|
|
|
|
func (p *apiServerPetClient) createPVC(pvc *v1.PersistentVolumeClaim) error {
|
|
_, err := p.c.Core().PersistentVolumeClaims(pvc.Namespace).Create(pvc)
|
|
return err
|
|
}
|
|
|
|
// SyncPVCs syncs pvcs in the given pcb.
|
|
func (p *apiServerPetClient) SyncPVCs(pet *pcb) error {
|
|
errmsg := ""
|
|
// Create new claims.
|
|
for i, pvc := range pet.pvcs {
|
|
_, err := p.getPVC(pvc.Name, pet.parent.Namespace)
|
|
if err != nil {
|
|
if errors.IsNotFound(err) {
|
|
var err error
|
|
if err = p.createPVC(&pet.pvcs[i]); err != nil {
|
|
errmsg += fmt.Sprintf("Failed to create %v: %v", pvc.Name, err)
|
|
}
|
|
p.event(pet.parent, "Create", fmt.Sprintf("pvc: %v", pvc.Name), err)
|
|
} else {
|
|
errmsg += fmt.Sprintf("Error trying to get pvc %v, %v.", pvc.Name, err)
|
|
}
|
|
}
|
|
// TODO: Check resource requirements and accessmodes, update if necessary
|
|
}
|
|
if len(errmsg) != 0 {
|
|
return fmt.Errorf("%v", errmsg)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// event formats an event for the given runtime object.
|
|
func (p *apiServerPetClient) event(obj runtime.Object, reason, msg string, err error) {
|
|
if err != nil {
|
|
p.recorder.Eventf(obj, v1.EventTypeWarning, fmt.Sprintf("Failed%v", reason), fmt.Sprintf("%v, error: %v", msg, err))
|
|
} else {
|
|
p.recorder.Eventf(obj, v1.EventTypeNormal, fmt.Sprintf("Successful%v", reason), msg)
|
|
}
|
|
}
|
|
|
|
// petHealthChecker is an interface to check pet health. It makes a boolean
|
|
// decision based on the given pod.
|
|
type petHealthChecker interface {
|
|
isHealthy(*v1.Pod) bool
|
|
isDying(*v1.Pod) bool
|
|
}
|
|
|
|
// defaultPetHealthChecker does basic health checking.
// It doesn't update, probe or get the pod.
type defaultPetHealthChecker struct{}
|
|
|
|
// isHealthy returns true if the pod is ready & running. If the pod has the
|
|
// "pod.alpha.kubernetes.io/initialized" annotation set to "false", pod state is ignored.
|
|
func (d *defaultPetHealthChecker) isHealthy(pod *v1.Pod) bool {
|
|
if pod == nil || pod.Status.Phase != v1.PodRunning {
|
|
return false
|
|
}
|
|
podReady := v1.IsPodReady(pod)
|
|
|
|
// User may have specified a pod readiness override through a debug annotation.
|
|
initialized, ok := pod.Annotations[StatefulSetInitAnnotation]
|
|
if ok {
|
|
if initAnnotation, err := strconv.ParseBool(initialized); err != nil {
|
|
glog.V(4).Infof("Failed to parse %v annotation on pod %v: %v", StatefulSetInitAnnotation, pod.Name, err)
|
|
} else if !initAnnotation {
|
|
glog.V(4).Infof("StatefulSet pod %v waiting on annotation %v", pod.Name, StatefulSetInitAnnotation)
|
|
podReady = initAnnotation
|
|
}
|
|
}
|
|
return podReady
|
|
}
|
|
|
|
// isDying returns true if the pod has a non-nil deletion timestamp. Since the
|
|
// timestamp can only decrease, once this method returns true for a given pet, it
|
|
// will never return false.
|
|
func (d *defaultPetHealthChecker) isDying(pod *v1.Pod) bool {
|
|
return pod != nil && pod.DeletionTimestamp != nil
|
|
}
|