GPU CUDA

Install CUDA Operator

helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
helm repo update
plane up datalayer-cuda-operator
kubectl get events -n datalayer-cuda-operator --sort-by='.lastTimestamp' -w
# 1s Normal Started pod/datalayer-cuda-operator-node-feature-discovery-worker-995cf Started container worker
# 1s Normal Created pod/datalayer-cuda-operator-node-feature-discovery-worker-2pqhv Created container worker
# TEMPORARY (clears once the NVIDIA container toolkit daemonset has configured the runtime) ==> Failed to create pod sandbox: rpc error: code = Unknown desc = failed to get sandbox runtime: no runtime for "nvidia" is configured
# 1s Normal Pulled pod/datalayer-cuda-operator-node-feature-discovery-worker-2pqhv Successfully pulled image "registry.k8s.io/nfd/node-feature-discovery:v0.14.2" in 7.22s (7.221s including waiting)
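
The plane up command is assumed to wrap a Helm install of the NVIDIA GPU Operator chart from the nvidia repository added above. For reference, a roughly equivalent manual install would be the following (the release name and any chart values that plane applies are assumptions):

helm install datalayer-cuda-operator nvidia/gpu-operator \
  --namespace datalayer-cuda-operator \
  --create-namespace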

Check the availability of the Nvidia GPU Operator Pods.

kubectl get pods -n datalayer-cuda-operator -w
# NAME READY STATUS RESTARTS AGE
# datalayer-cuda-operator-node-feature-discovery-gc-5f54c4b65rjhq 1/1 Running 0 2m22s
# datalayer-cuda-operator-node-feature-discovery-master-cdc55p6rl 1/1 Running 0 2m22s
# datalayer-cuda-operator-node-feature-discovery-worker-4qdwf 1/1 Running 0 2m22s
# ...
# datalayer-cuda-operator-node-feature-discovery-worker-sxstj 1/1 Running 0 2m22s
# gpu-feature-discovery-lwf4g 2/2 Running 0 2m5s
# gpu-operator-5857f855b4-bdlqt 1/1 Running 0 2m22s
# nvidia-container-toolkit-daemonset-7jn97 1/1 Running 0 2m7s
# nvidia-cuda-validator-f4hm7 0/1 Completed 0 83s
# nvidia-dcgm-exporter-l64pw 1/1 Running 0 2m6s
# nvidia-device-plugin-daemonset-bnn9t 2/2 Running 0 2m6s
# nvidia-operator-validator-hc92j 1/1 Running 0 2m6s
kubectl get pods -n datalayer-cuda-operator -l app=nvidia-container-toolkit-daemonset
# NAME READY STATUS RESTARTS AGE
# nvidia-container-toolkit-daemonset-7jn97 1/1 Running 0 2m58s
kubectl get nodes -l nvidia.com/gpu.present=true
kubectl get nodes -l xpu.datalayer.io/gpu-cuda=true -o json | jq '.items[].metadata.labels' | grep nvidia

Time Slicing

Read more about Nvidia GPU Time Slicing.
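
The gpu.replicas value checked below comes from the device plugin time-slicing configuration, which this deployment is assumed to apply through the operator chart values. A minimal hand-applied sketch (the ConfigMap name time-slicing-config and the default key any are assumptions) looks like this:

cat << EOF | kubectl apply -n datalayer-cuda-operator -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: time-slicing-config
data:
  any: |-
    version: v1
    sharing:
      timeSlicing:
        # renameByDefault advertises the shared replicas as nvidia.com/gpu.shared,
        # which matches the node capacity shown below.
        renameByDefault: true
        resources:
        - name: nvidia.com/gpu
          replicas: 20
EOF
kubectl patch clusterpolicies.nvidia.com/cluster-policy --type merge \
  -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'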

kubectl get nodes -l nvidia.com/gpu.present=true
kubectl get nodes -l nvidia.com/gpu.present=true -o json | jq '.items[].metadata.labels' | grep "nvidia.com/gpu"
kubectl get nodes -l nvidia.com/gpu.present=true -o json | jq '.items[].metadata.labels' | grep "nvidia.com/gpu.replicas"
kubectl get nodes -l xpu.datalayer.io/gpu-cuda=true -o json | jq '.items[].status.allocatable'
kubectl describe nodes -l nvidia.com/gpu.present=true
# ...
# Labels:
# nvidia.com/gpu.count=4
# nvidia.com/gpu.product=Tesla-...
# nvidia.com/gpu.replicas=20
# Capacity:
# nvidia.com/gpu: 0
# nvidia.com/gpu.shared: 20
# ...
# Allocatable:
# nvidia.com/gpu: 0
# nvidia.com/gpu.shared: 20
# ...
kubectl describe node -l nvidia.com/gpu.present=true | grep "nvidia.com/gpu"
kubectl describe node -l nvidia.com/gpu.present=true | grep "nvidia.com/gpu.replicas"

Test GPU with a Pod.

cat << EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: vectoradd-cuda
spec:
  restartPolicy: OnFailure
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: nvidia.com/gpu.present
            operator: In
            values:
            - "true"
  containers:
  - name: cuda-vectoradd
    image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1"
    resources:
      limits:
        nvidia.com/gpu: 1
EOF
kubectl get pod -n default -w
kubectl logs vectoradd-cuda -n default
kubectl delete pod vectoradd-cuda -n default

Test GPU with a Pod and run commands from a shell.

cat << EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: vectoradd-cuda
spec:
  restartPolicy: OnFailure
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: nvidia.com/gpu.present
            operator: In
            values:
            - "true"
  containers:
  - name: cuda-vectoradd
    image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1"
    command: ["/bin/sh"]
    args: ["-c", "while true; do echo hello; sleep 10; done"]
    resources:
      limits:
        nvidia.com/gpu: 1
EOF
kubectl get pod -n default -w
kubectl exec -it vectoradd-cuda -n default -- bash
nvidia-smi
cat /etc/lsb-release
nvcc --version
exit
kubectl delete pod vectoradd-cuda -n default

Test GPU with a Jupyter Pod.

cat << EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: jupyter-test
  namespace: datalayer-jupyter
spec:
  # restartPolicy: OnFailure
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: nvidia.com/gpu.present
            operator: In
            values:
            - "true"
  imagePullSecrets:
  - name: reg-creds
  containers:
  - name: cuda-vectoradd
    image: ${DATALAYER_DOCKER_REGISTRY}/datalayer/jupyter-fastai-cuda:0.0.8
    command: ["/bin/sh"]
    args: ["-c", "while true; do echo hello; sleep 10; done"]
    resources:
      limits:
        nvidia.com/gpu: 1
EOF
kubectl get pod -n datalayer-jupyter
kubectl exec -it jupyter-test -n datalayer-jupyter -- bash
cat /etc/lsb-release
nvcc --version
~/cuda-samples/Samples/0_Introduction/vectorAdd/vectorAdd
python -c "import torch; print(torch.cuda.is_available())"
nvidia-smi
exit
kubectl delete pod jupyter-test -n datalayer-jupyter

Test GPU with a Deployment.

cat << EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: time-slicing-verification
  labels:
    app: time-slicing-verification
spec:
  replicas: 20
  selector:
    matchLabels:
      app: time-slicing-verification
  template:
    metadata:
      labels:
        app: time-slicing-verification
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: nvidia.com/gpu.present
                operator: In
                values:
                - "true"
      hostPID: true
      containers:
      - name: cuda-sample-vector-add
        image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04
        command: ["/bin/bash", "-c", "--"]
        args:
        - while true; do /cuda-samples/vectorAdd; done
        resources:
          limits:
            nvidia.com/gpu: 1
EOF
kubectl get pod -n default -w
kubectl logs deploy/time-slicing-verification -n default -f
#
kubectl exec time-slicing-verification-5b7fcc97db-8m5gl -- nvidia-smi
Sat Nov 11 05:46:16 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12 Driver Version: 535.104.12 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 Tesla V100-PCIE-16GB On | 00000000:00:06.0 Off | 0 |
| N/A 32C P0 44W / 250W | 666MiB / 16384MiB | 29% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 960438 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960439 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960442 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960467 C /cuda-samples/vectorAdd 12MiB |
+---------------------------------------------------------------------------------------+
#
kubectl delete deployment time-slicing-verification -n default

MIG

Set up the GPUs with MIG (Multi-Instance GPU). Read more about MIG.

kubectl get pods -n datalayer-cuda-operator -w
kubectl get pods -n datalayer-cuda-operator -l app=nvidia-mig-manager

Get the list of supported configurations and pick one of them (e.g. all-1g.10gb).

kubectl describe configmap -n datalayer-cuda-operator default-mig-parted-config
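
For illustration, an entry such as all-1g.10gb in that ConfigMap follows the mig-parted configuration format and looks roughly like this (the exact profiles and instance counts depend on the GPU model):

version: v1
mig-configs:
  all-1g.10gb:
  - devices: all
    mig-enabled: true
    mig-devices:
      "1g.10gb": 7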

Configure MIG.

kubectl get nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true"
kubectl label nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" nvidia.com/mig.config=all-1g.10gb --overwrite
kubectl logs -n datalayer-cuda-operator -l app=nvidia-mig-manager
# MIG configuration applied successfully
# Restarting validator pod to re-run all validations
# pod "nvidia-operator-validator-zmpcn" deleted
# Restarting all GPU clients previously shutdown in Kubernetes by reenabling their component-specific nodeSelector labels
# node/jupyter-cuda-large-node-861557 labeled
# Changing the 'nvidia.com/mig.config.state' node label to 'success'
# node/jupyter-cuda-large-node-861557 labeled
# time="2024-04-16T11:56:36Z" level=info msg="Successfully updated to MIG config: all-1g.5gb"
# time="2024-04-16T11:56:36Z" level=info msg="Waiting for change to 'nvidia.com/mig.config' label"
kubectl get nodes -l nvidia.com/gpu.present=true --show-labels | grep "nvidia.com/gpu"
kubectl get nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" -o json | jq '.items[].status.allocatable'
kubectl describe node -l nvidia.com/gpu.present=true
# ...
# Labels:
# nvidia.com/gpu.count=4
# nvidia.com/gpu.product=Tesla-... # <<< If the MIG configuration is invalid, this value contains "INVALID"
# Capacity:
# nvidia.com/gpu: 0
# ...
# Allocatable:
# nvidia.com/gpu: 0
# ...
kubectl describe node -l nvidia.com/gpu.present=true | grep "nvidia.com/gpu"
kubectl get nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" -o json | jq '.items[].metadata.labels'
kubectl get nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" -o json | jq '.items[].metadata.labels' | grep mig
# "nvidia.com/gpu.deploy.mig-manager": "true"
# "nvidia.com/mig.config": "all-1g.5gb"
# "nvidia.com/mig.config.state": "success" # <<< Possible value: pending | rebooting | success
# "nvidia.com/mig.strategy": "single" # <<< Possible value: single | mixed | none

You can update the MIG configuration.

kubectl label nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" nvidia.com/mig.config=all-3g.20gb --overwrite

You can also disable MIG.

kubectl label nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" nvidia.com/mig.config=all-disabled --overwrite

Test GPU MIG with a Pod.

cat << EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: vectoradd-cuda
spec:
  restartPolicy: OnFailure
  containers:
  - name: vectoradd
    image: nvidia/samples:vectoradd-cuda11.2.1
    resources:
      limits:
        nvidia.com/gpu: 1
  nodeSelector:
    # nvidia.com/gpu.product: A100-SXM4-40GB-MIG-1g.5gb
    nvidia.com/gpu.present: "true"
EOF
kubectl get pod -n default -w
kubectl describe pod vectoradd-cuda -n default
kubectl logs vectoradd-cuda -n default
kubectl delete pod vectoradd-cuda -n default

Test GPU MIG with a Deployment.

cat << EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mig-verification
  labels:
    app: mig-verification
spec:
  replicas: 20
  selector:
    matchLabels:
      app: mig-verification
  template:
    metadata:
      labels:
        app: mig-verification
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: nvidia.com/gpu.present
                operator: In
                values:
                - "true"
      hostPID: true
      containers:
      - name: cuda-sample-vector-add
        image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04
        command: ["/bin/bash", "-c", "--"]
        args:
        - while true; do /cuda-samples/vectorAdd; done
        resources:
          limits:
            nvidia.com/gpu: 1
EOF
kubectl get pod -n default -w
kubectl logs deploy/mig-verification -n default -f
#
kubectl exec mig-verification-5b7fcc97db-8m5gl -- nvidia-smi
Sat Nov 11 05:46:16 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12 Driver Version: 535.104.12 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 Tesla V100-PCIE-16GB On | 00000000:00:06.0 Off | 0 |
| N/A 32C P0 44W / 250W | 666MiB / 16384MiB | 29% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 960438 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960439 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960442 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960467 C /cuda-samples/vectorAdd 12MiB |
+---------------------------------------------------------------------------------------+
#
kubectl delete deployment mig-verification -n default

Further Validation

If you can run PyTorch code, use the following to confirm that the GPU works correctly and to inspect its memory.

import torch
print(torch.cuda.get_device_name(0)) # eg 'NVIDIA H100 PCIe MIG 1g.10gb'
t = torch.cuda.get_device_properties(0).total_memory
print("Total GPU Memory", t)
r = torch.cuda.memory_reserved(0)
print("Reserved GPU Memory", r)
a = torch.cuda.memory_allocated(0)
print("Allocated GPU Memory", a)
f = r - a  # free inside reserved
print("Free inside Reserved GPU Memory", f)
SIZE = 800  # try 8000 or 80000
gpu_rand = torch.rand(SIZE, SIZE, device="cuda")
print(gpu_rand)
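
The same check can be run non-interactively from outside the Pod, assuming the jupyter-test Pod from the earlier section is still running:

kubectl exec jupyter-test -n datalayer-jupyter -- \
  python -c "import torch; print(torch.cuda.get_device_name(0)); print(torch.cuda.get_device_properties(0).total_memory)"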

Tear down

If needed, tear down the CUDA operator.

plane down datalayer-cuda-operator
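
This is assumed to remove the Helm release installed by plane up; a roughly equivalent manual clean-up (the release name is an assumption) would be:

helm uninstall datalayer-cuda-operator -n datalayer-cuda-operator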