Inject Failure Demo
This demo shows how to inject a fault into a Pod's initContainer in an existing minikube cluster.
Detailed Demo Steps
# Demo: inject a fault into an initContainer in an existing minikube cluster
# Reads virtual-gpu-node.yaml and failed-pod.yaml from the directory named by WORK_DIR.
kubectl config use-context minikube
kubectl get nodes
# Deploy and install kwok
kubectl apply -f https://github.com/kubernetes-sigs/kwok/releases/download/v0.5.2/kwok.yaml --wait
kubectl get pod -n kube-system | grep kwok
# Deploy pod stages
kubectl apply -f https://github.com/kubernetes-sigs/kwok/releases/download/v0.5.2/stage-fast.yaml
# NOTE(review): this stage is fetched from the main branch while the other manifests are pinned to v0.5.2 - consider pinning it too.
kubectl apply -f https://github.com/kubernetes-sigs/kwok/raw/main/kustomize/stage/pod/chaos/pod-init-container-running-failed.yaml
kubectl get stages
# Add a virtual-gpu node
grep -A 15 allocatable "${WORK_DIR}/virtual-gpu-node.yaml"
kubectl apply -f "${WORK_DIR}/virtual-gpu-node.yaml"
kubectl get nodes
kubectl describe node virtual-gpu-node | grep -A 15 Allocatable
# Create a pod and inject an error into its initContainer to simulate a preflight check error
grep failed "${WORK_DIR}/failed-pod.yaml"
kubectl apply -f "${WORK_DIR}/failed-pod.yaml"
kubectl get pod
kubectl describe pod failed-pod | grep -A 10 "Init Containers"
# Clean up
kubectl delete -f "${WORK_DIR}/failed-pod.yaml"
kubectl delete -f "${WORK_DIR}/virtual-gpu-node.yaml"
kubectl delete -f https://github.com/kubernetes-sigs/kwok/releases/download/v0.5.2/kwok.yaml
# Thank you!
clear
virtual-gpu-node.yaml
# Fake GPU Node used by the demo. It is annotated kwok.x-k8s.io/node: fake and
# labelled type: kwok, which is the label the demo's failed-pod selects through
# its nodeAffinity.
apiVersion: v1
kind: Node
metadata:
  annotations:
    kwok.x-k8s.io/node: fake
  labels:
    beta.kubernetes.io/arch: amd64
    beta.kubernetes.io/os: linux
    gpu_affinity_capable: "true"
    healthcolor: green
    kubernetes.io/arch: amd64
    kubernetes.io/hostname: virtual-dgxa100.40g-0
    kubernetes.io/os: linux
    kubernetes.io/role: agent
    node-role.bcp.ngc.nvidia.com/role: user-workload
    node-role.kubernetes.io/agent: ""
    nodeGroup: gpu
    nodeInstance: dgxa100.40g.8.norm
    nodeType: gpu
    pod_gpu_size.static: dynamic
    preempt_upon_cordon: "true"
    type: kwok
  name: virtual-gpu-node
spec:
  taints:  # Avoid scheduling actual running pods to fake Node
    - effect: NoSchedule
      key: kwok.x-k8s.io/node
      value: fake
status:
  # allocatable and capacity advertise the same resources, including 8 GPUs.
  allocatable:
    cpu: "256"
    ephemeral-storage: 15Ti
    hugepages-1Gi: "0"
    hugepages-2Mi: "0"
    intel.com/mlnx_sriov_rdma1: "1"
    intel.com/mlnx_sriov_rdma2: "1"
    intel.com/mlnx_sriov_rdma3: "1"
    intel.com/mlnx_sriov_rdma4: "1"
    intel.com/mlnx_sriov_rdma5: "1"
    intel.com/mlnx_sriov_rdma6: "1"
    intel.com/mlnx_sriov_rdma7: "1"
    intel.com/mlnx_sriov_rdma8: "1"
    memory: 1Ti
    nvidia.com/gpu: "8"
    pods: "110"
  capacity:
    cpu: "256"
    ephemeral-storage: 15Ti
    hugepages-1Gi: "0"
    hugepages-2Mi: "0"
    intel.com/mlnx_sriov_rdma1: "1"
    intel.com/mlnx_sriov_rdma2: "1"
    intel.com/mlnx_sriov_rdma3: "1"
    intel.com/mlnx_sriov_rdma4: "1"
    intel.com/mlnx_sriov_rdma5: "1"
    intel.com/mlnx_sriov_rdma6: "1"
    intel.com/mlnx_sriov_rdma7: "1"
    intel.com/mlnx_sriov_rdma8: "1"
    memory: 1Ti
    nvidia.com/gpu: "8"
    pods: "110"
  # Every problem-type condition is "False"; only AggregatedNodeHealth and
  # Ready are "True".
  # NOTE(review): the message/reason text of ReadonlyFilesystem,
  # GpuDbeMsbeProblem, CephMountsHung and GpuHWSlowDown describes a fault even
  # though status is "False" - the strings are kept verbatim; confirm whether
  # they should be reworded.
  conditions:
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: Node is healthy
      reason: NodeReady
      status: "True"
      type: AggregatedNodeHealth
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: Flannel is running on this node
      reason: FlannelIsUp
      status: "False"
      type: NetworkUnavailable
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: acs kernel module is disabled
      reason: ACSModuleDisabled
      status: "False"
      type: ACSModuleCheck
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: kernel has no deadlock
      reason: KernelHasNoDeadlock
      status: "False"
      type: KernelDeadlock
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: Filesystem is read-only
      reason: FilesystemIsReadOnly
      status: "False"
      type: ReadonlyFilesystem
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: Flannel is running on this node
      reason: FlannelDeviceAvailable
      status: "False"
      type: FlannelNetworkDeviceProblem
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: Dgx has /raid
      reason: DgxRaidOk
      status: "False"
      type: DgxRaidProblem
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: Node is in NW Topology CM or feature disabled
      reason: NodeIsAdded
      status: "False"
      type: NodeNotInNWTopologyCM
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: GPU has a DBE/MSBE problem
      reason: GpuHasNoDbeMsbeProblem
      status: "False"
      type: GpuDbeMsbeProblem
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: ceph client is backlisted resulting in hung mounts
      reason: CephClientBlackListed
      status: "False"
      type: CephMountsHung
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: nv_peer_mem is loaded and active
      reason: NvPeerMemKernelModuleOK
      status: "False"
      type: NvPeerMemProblem
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: RoCE interface(s) spoof check is OFF (OK)
      reason: RoCESpoofCheck
      status: "False"
      type: RoCESpoofCheckProblem
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: GPU has HW Slowdown in Active State
      reason: GpuHWSlowDownNotActive
      status: "False"
      type: GpuHWSlowDown
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: RoCE interface(s) are UP
      reason: RoCECarrierSignal
      status: "False"
      type: RoCELinksProblem
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: kubelet has sufficient memory available
      reason: KubeletHasSufficientMemory
      status: "False"
      type: MemoryPressure
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: kubelet has no disk pressure
      reason: KubeletHasNoDiskPressure
      status: "False"
      type: DiskPressure
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: kubelet has sufficient PID available
      reason: KubeletHasSufficientPID
      status: "False"
      type: PIDPressure
    - lastHeartbeatTime: null
      lastTransitionTime: null
      message: kubelet is posting ready status. AppArmor enabled
      reason: KubeletReady
      status: "True"
      type: Ready
  phase: Running
failed-pod.yaml
# Demo Pod whose initContainer (nccl-test) is the target of the injected
# failure. The label and annotations share the
# pod-init-container-running-failed.stage.kwok.x-k8s.io prefix; presumably they
# are read by the kwok Stage of that name - verify against the Stage manifest.
apiVersion: v1
kind: Pod
metadata:
  name: failed-pod
  labels:
    pod-init-container-running-failed.stage.kwok.x-k8s.io: "true"
  annotations:
    # container-name matches the initContainer declared under spec.initContainers.
    pod-init-container-running-failed.stage.kwok.x-k8s.io/container-name: nccl-test
    pod-init-container-running-failed.stage.kwok.x-k8s.io/reason: nccl-test-failed
    pod-init-container-running-failed.stage.kwok.x-k8s.io/message: "nccl test failed"
    pod-init-container-running-failed.stage.kwok.x-k8s.io/exit-code: "3"
    # NOTE(review): confirm the unit the Stage expects for delay/jitter-delay
    # (bare number vs. a duration string such as "1s").
    pod-init-container-running-failed.stage.kwok.x-k8s.io/delay: "1000"
    pod-init-container-running-failed.stage.kwok.x-k8s.io/jitter-delay: "5000"
spec:
  # Only schedule onto nodes labelled type=kwok.
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: type
                operator: In
                values:
                  - kwok
  # The fake Node carries a kwok.x-k8s.io/node NoSchedule taint.
  # Either remove that taint from the Node or keep this toleration.
  tolerations:
    - key: "kwok.x-k8s.io/node"
      operator: "Exists"
      effect: "NoSchedule"
  initContainers:
    - image: nccl
      name: nccl-test
  containers:
    - image: llm
      name: training-job
      resources:
        limits:
          nvidia.com/gpu: '1'
        requests:
          cpu: 100m
          memory: 100Mi
          nvidia.com/gpu: '1'