Inject Failure Demo #

This demo shows how to inject a fault into a pod's initContainer in an existing minikube cluster.

Detailed Demo Steps

# Demo: inject a fault into an initContainer in an existing minikube cluster
# WORK_DIR must point to the directory that contains virtual-gpu-node.yaml
# and failed-pod.yaml; every path built from it is quoted so directories
# with spaces keep working.
kubectl config use-context minikube

kubectl get nodes

# Deploy and install kwok
kubectl apply -f https://github.com/kubernetes-sigs/kwok/releases/download/v0.5.2/kwok.yaml --wait
kubectl get pod -n kube-system | grep kwok

# Deploy pod stages
kubectl apply -f https://github.com/kubernetes-sigs/kwok/releases/download/v0.5.2/stage-fast.yaml
kubectl apply -f https://github.com/kubernetes-sigs/kwok/raw/main/kustomize/stage/pod/chaos/pod-init-container-running-failed.yaml
kubectl get stages

# Add a virtual-gpu node
# (grep options go before the pattern so non-GNU grep accepts them too)
grep -A 15 allocatable "${WORK_DIR}/virtual-gpu-node.yaml"
kubectl apply -f "${WORK_DIR}/virtual-gpu-node.yaml"
kubectl get nodes
kubectl describe node virtual-gpu-node | grep -A 15 Allocatable

# Create a pod and inject an error into its initContainer to simulate a preflight check error
grep failed "${WORK_DIR}/failed-pod.yaml"
kubectl apply -f "${WORK_DIR}/failed-pod.yaml"
kubectl get pod
kubectl describe pod failed-pod | grep -A 10 "Init Containers"

# Clean up
kubectl delete -f "${WORK_DIR}/failed-pod.yaml"
kubectl delete -f "${WORK_DIR}/virtual-gpu-node.yaml"
kubectl delete -f https://github.com/kubernetes-sigs/kwok/releases/download/v0.5.2/kwok.yaml

# Thank you!

clear

virtual-gpu-node.yaml

# Fake GPU node for the demo: 8 GPUs, 256 CPUs, 1Ti memory.
# The kwok.x-k8s.io/node annotation is the marker kwok uses by default to
# pick the nodes it simulates (see the kwok documentation).
apiVersion: v1
kind: Node
metadata:
  annotations:
    kwok.x-k8s.io/node: fake
  labels:
    beta.kubernetes.io/arch: amd64
    beta.kubernetes.io/os: linux
    gpu_affinity_capable: "true"
    healthcolor: green
    kubernetes.io/arch: amd64
    kubernetes.io/hostname: virtual-dgxa100.40g-0
    kubernetes.io/os: linux
    kubernetes.io/role: agent
    node-role.bcp.ngc.nvidia.com/role: user-workload
    node-role.kubernetes.io/agent: ""
    nodeGroup: gpu
    nodeInstance: dgxa100.40g.8.norm
    nodeType: gpu
    pod_gpu_size.static: dynamic
    preempt_upon_cordon: "true"
    # Label that failed-pod.yaml selects on through its nodeAffinity.
    type: kwok
  name: virtual-gpu-node
spec:
  taints: # Avoid scheduling actual running pods to fake Node
  - effect: NoSchedule
    key: kwok.x-k8s.io/node
    value: fake
status:
  # allocatable and capacity are intentionally identical.
  allocatable:
    cpu: "256"
    ephemeral-storage: 15Ti
    hugepages-1Gi: "0"
    hugepages-2Mi: "0"
    intel.com/mlnx_sriov_rdma1: "1"
    intel.com/mlnx_sriov_rdma2: "1"
    intel.com/mlnx_sriov_rdma3: "1"
    intel.com/mlnx_sriov_rdma4: "1"
    intel.com/mlnx_sriov_rdma5: "1"
    intel.com/mlnx_sriov_rdma6: "1"
    intel.com/mlnx_sriov_rdma7: "1"
    intel.com/mlnx_sriov_rdma8: "1"
    memory: 1Ti
    nvidia.com/gpu: "8"
    pods: "110"
  capacity:
    cpu: "256"
    ephemeral-storage: 15Ti
    hugepages-1Gi: "0"
    hugepages-2Mi: "0"
    intel.com/mlnx_sriov_rdma1: "1"
    intel.com/mlnx_sriov_rdma2: "1"
    intel.com/mlnx_sriov_rdma3: "1"
    intel.com/mlnx_sriov_rdma4: "1"
    intel.com/mlnx_sriov_rdma5: "1"
    intel.com/mlnx_sriov_rdma6: "1"
    intel.com/mlnx_sriov_rdma7: "1"
    intel.com/mlnx_sriov_rdma8: "1"
    memory: 1Ti
    nvidia.com/gpu: "8"
    pods: "110"
  # Every problem-type condition below is reported with status "False"
  # (problem absent), so each message describes the healthy state.
  conditions:
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: Node is healthy
    reason: NodeReady
    status: "True"
    type: AggregatedNodeHealth
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: Flannel is running on this node
    reason: FlannelIsUp
    status: "False"
    type: NetworkUnavailable
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: acs kernel module is disabled
    reason: ACSModuleDisabled
    status: "False"
    type: ACSModuleCheck
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: kernel has no deadlock
    reason: KernelHasNoDeadlock
    status: "False"
    type: KernelDeadlock
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: Filesystem is not read-only
    reason: FilesystemIsNotReadOnly
    status: "False"
    type: ReadonlyFilesystem
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: Flannel is running on this node
    reason: FlannelDeviceAvailable
    status: "False"
    type: FlannelNetworkDeviceProblem
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: Dgx has /raid
    reason: DgxRaidOk
    status: "False"
    type: DgxRaidProblem
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: Node is in NW Topology CM or feature disabled
    reason: NodeIsAdded
    status: "False"
    type: NodeNotInNWTopologyCM
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: GPU has no DBE/MSBE problem
    reason: GpuHasNoDbeMsbeProblem
    status: "False"
    type: GpuDbeMsbeProblem
  # NOTE(review): the reason still names the unhealthy state although the
  # status is "False"; confirm the reason token the health checker emits
  # for the healthy case and update it.
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: ceph client is not blacklisted and mounts are not hung
    reason: CephClientBlackListed
    status: "False"
    type: CephMountsHung
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: nv_peer_mem is loaded and active
    reason: NvPeerMemKernelModuleOK
    status: "False"
    type: NvPeerMemProblem
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: RoCE interface(s) spoof check is OFF (OK)
    reason: RoCESpoofCheck
    status: "False"
    type: RoCESpoofCheckProblem
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: GPU has no HW Slowdown in Active State
    reason: GpuHWSlowDownNotActive
    status: "False"
    type: GpuHWSlowDown
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: RoCE interface(s) are UP
    reason: RoCECarrierSignal
    status: "False"
    type: RoCELinksProblem
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: kubelet has sufficient memory available
    reason: KubeletHasSufficientMemory
    status: "False"
    type: MemoryPressure
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: kubelet has no disk pressure
    reason: KubeletHasNoDiskPressure
    status: "False"
    type: DiskPressure
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: kubelet has sufficient PID available
    reason: KubeletHasSufficientPID
    status: "False"
    type: PIDPressure
  - lastHeartbeatTime: null
    lastTransitionTime: null
    message: kubelet is posting ready status. AppArmor enabled
    reason: KubeletReady
    status: "True"
    type: Ready
  phase: Running

failed-pod.yaml

# Pod whose init container "nccl-test" is made to fail, simulating a failed
# preflight check before the "training-job" container would start.
apiVersion: v1
kind: Pod
metadata:
  name: failed-pod
  labels:
    # Presumably the label the pod-init-container-running-failed Stage
    # selects on; verify against the Stage manifest applied by the demo.
    pod-init-container-running-failed.stage.kwok.x-k8s.io: "true"
  annotations:
    # container-name matches the init container declared under spec.
    pod-init-container-running-failed.stage.kwok.x-k8s.io/container-name: nccl-test
    pod-init-container-running-failed.stage.kwok.x-k8s.io/reason: nccl-test-failed
    pod-init-container-running-failed.stage.kwok.x-k8s.io/message: nccl test failed
    pod-init-container-running-failed.stage.kwok.x-k8s.io/exit-code: "3"
    # NOTE(review): kwok's durationFrom is documented to read Go duration
    # strings (e.g. "1s"); confirm that the unit-less values below are
    # honored rather than silently ignored.
    pod-init-container-running-failed.stage.kwok.x-k8s.io/delay: "1000"
    pod-init-container-running-failed.stage.kwok.x-k8s.io/jitter-delay: "5000"
spec:
  initContainers:
  - name: nccl-test
    image: nccl
  containers:
  - name: training-job
    image: llm
    resources:
      requests:
        cpu: 100m
        memory: 100Mi
        nvidia.com/gpu: "1"
      limits:
        nvidia.com/gpu: "1"
  # Only schedule onto nodes labelled type=kwok.
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: type
            operator: In
            values:
            - kwok
  # The fake Node carries a kwok.x-k8s.io/node NoSchedule taint.
  # Either remove that taint from the Node or keep this toleration.
  tolerations:
  - key: kwok.x-k8s.io/node
    operator: Exists
    effect: NoSchedule