GPU CUDA

Install CUDA Operator

helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
helm repo update
plane up datalayer-cuda-operator
kubectl get events -n datalayer-cuda-operator --sort-by='.lastTimestamp' -w
# 1s Normal Started pod/datalayer-cuda-operator-node-feature-discovery-worker-995cf Started container worker
# 1s Normal Created pod/datalayer-cuda-operator-node-feature-discovery-worker-2pqhv Created container worker
# TEMPORARY (clears once the NVIDIA container toolkit daemonset has configured the runtime) ==> Failed to create pod sandbox: rpc error: code = Unknown desc = failed to get sandbox runtime: no runtime for "nvidia" is configured
# 1s Normal Pulled pod/datalayer-cuda-operator-node-feature-discovery-worker-2pqhv Successfully pulled image "registry.k8s.io/nfd/node-feature-discovery:v0.14.2" in 7.22s (7.221s including waiting)
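
The plane up command is assumed to wrap a Helm install of the NVIDIA GPU Operator chart from the nvidia repository added above. For reference, a roughly equivalent manual install would be the following (the release name and any chart values that plane applies are assumptions):

helm install datalayer-cuda-operator nvidia/gpu-operator \
  --namespace datalayer-cuda-operator \
  --create-namespace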

Check the availability of the Nvidia GPU Operator Pods.

kubectl get pods -n datalayer-cuda-operator -w
# NAME READY STATUS RESTARTS AGE
# datalayer-cuda-operator-node-feature-discovery-gc-5f54c4b65rjhq 1/1 Running 0 2m22s
# datalayer-cuda-operator-node-feature-discovery-master-cdc55p6rl 1/1 Running 0 2m22s
# datalayer-cuda-operator-node-feature-discovery-worker-4qdwf 1/1 Running 0 2m22s
# ...
# datalayer-cuda-operator-node-feature-discovery-worker-sxstj 1/1 Running 0 2m22s
# gpu-feature-discovery-lwf4g 2/2 Running 0 2m5s
# gpu-operator-5857f855b4-bdlqt 1/1 Running 0 2m22s
# nvidia-container-toolkit-daemonset-7jn97 1/1 Running 0 2m7s
# nvidia-cuda-validator-f4hm7 0/1 Completed 0 83s
# nvidia-dcgm-exporter-l64pw 1/1 Running 0 2m6s
# nvidia-device-plugin-daemonset-bnn9t 2/2 Running 0 2m6s
# nvidia-operator-validator-hc92j 1/1 Running 0 2m6s
kubectl get pods -n datalayer-cuda-operator -l app=nvidia-container-toolkit-daemonset
# NAME READY STATUS RESTARTS AGE
# nvidia-container-toolkit-daemonset-7jn97 1/1 Running 0 2m58s
kubectl get nodes -l nvidia.com/gpu.present=true
kubectl get nodes -l xpu.datalayer.io/gpu-cuda=true -o json | jq '.items[].metadata.labels' | grep nvidia

Time Slicing

Read more about Nvidia GPU Time Slicing.
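
The gpu.replicas value checked below comes from the device plugin time-slicing configuration, which this deployment is assumed to apply through the operator chart values. A minimal hand-applied sketch (the ConfigMap name time-slicing-config and the default key any are assumptions) looks like this:

cat << EOF | kubectl apply -n datalayer-cuda-operator -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: time-slicing-config
data:
  any: |-
    version: v1
    sharing:
      timeSlicing:
        # renameByDefault advertises the shared replicas as nvidia.com/gpu.shared,
        # which matches the node capacity shown below.
        renameByDefault: true
        resources:
        - name: nvidia.com/gpu
          replicas: 20
EOF
kubectl patch clusterpolicies.nvidia.com/cluster-policy --type merge \
  -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'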

kubectl get nodes -l nvidia.com/gpu.present=true
kubectl get nodes -l nvidia.com/gpu.present=true -o json | jq '.items[].metadata.labels' | grep "nvidia.com/gpu"
kubectl get nodes -l nvidia.com/gpu.present=true -o json | jq '.items[].metadata.labels' | grep "nvidia.com/gpu.replicas"
kubectl get nodes -l xpu.datalayer.io/gpu-cuda=true -o json | jq '.items[].status.allocatable'
kubectl describe nodes -l nvidia.com/gpu.present=true
# ...
# Labels:
# nvidia.com/gpu.count=4
# nvidia.com/gpu.product=Tesla-...
# nvidia.com/gpu.replicas=20
# Capacity:
# nvidia.com/gpu: 0
# nvidia.com/gpu.shared: 20
# ...
# Allocatable:
# nvidia.com/gpu: 0
# nvidia.com/gpu.shared: 20
# ...
kubectl describe node -l nvidia.com/gpu.present=true | grep "nvidia.com/gpu"
kubectl describe node -l nvidia.com/gpu.present=true | grep "nvidia.com/gpu.replicas"

Test GPU with a Pod.

cat << EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: vectoradd-cuda
spec:
  restartPolicy: OnFailure
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: nvidia.com/gpu.present
            operator: In
            values:
            - "true"
  containers:
  - name: cuda-vectoradd
    image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1"
    resources:
      limits:
        nvidia.com/gpu: 1
EOF
kubectl get pod -n default -w
kubectl logs vectoradd-cuda -n default
kubectl delete pod vectoradd-cuda -n default

Test GPU with a Pod and run commands from a shell.

cat << EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: vectoradd-cuda
spec:
  restartPolicy: OnFailure
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: nvidia.com/gpu.present
            operator: In
            values:
            - "true"
  containers:
  - name: cuda-vectoradd
    image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1"
    command: ["/bin/sh"]
    args: ["-c", "while true; do echo hello; sleep 10; done"]
    resources:
      limits:
        nvidia.com/gpu: 1
EOF
kubectl get pod -n default -w
kubectl exec -it vectoradd-cuda -n default -- bash
nvidia-smi
cat /etc/lsb-release
nvcc --version
exit
kubectl delete pod vectoradd-cuda -n default

Test GPU with a Jupyter Pod.

cat << EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: jupyter-test
  namespace: datalayer-jupyter
spec:
  # restartPolicy: OnFailure
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: nvidia.com/gpu.present
            operator: In
            values:
            - "true"
  imagePullSecrets:
  - name: reg-creds
  containers:
  - name: cuda-vectoradd
    image: ${DATALAYER_DOCKER_REGISTRY}/datalayer/jupyter-fastai-cuda:0.0.8
    command: ["/bin/sh"]
    args: ["-c", "while true; do echo hello; sleep 10; done"]
    resources:
      limits:
        nvidia.com/gpu: 1
EOF
kubectl get pod -n datalayer-jupyter
kubectl exec -it jupyter-test -n datalayer-jupyter -- bash
cat /etc/lsb-release
nvcc --version
~/cuda-samples/Samples/0_Introduction/vectorAdd/vectorAdd
python -c "import torch; print(torch.cuda.is_available())"
nvidia-smi
exit
kubectl delete pod jupyter-test -n datalayer-jupyter

Test GPU with a Deployment.

cat << EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: time-slicing-verification
  labels:
    app: time-slicing-verification
spec:
  replicas: 20
  selector:
    matchLabels:
      app: time-slicing-verification
  template:
    metadata:
      labels:
        app: time-slicing-verification
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: nvidia.com/gpu.present
                operator: In
                values:
                - "true"
      hostPID: true
      containers:
      - name: cuda-sample-vector-add
        image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04
        command: ["/bin/bash", "-c", "--"]
        args:
        - while true; do /cuda-samples/vectorAdd; done
        resources:
          limits:
            nvidia.com/gpu: 1
EOF
kubectl get pod -n default -w
kubectl logs deploy/time-slicing-verification -n default -f
#
kubectl exec time-slicing-verification-5b7fcc97db-8m5gl -- nvidia-smi
Sat Nov 11 05:46:16 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12 Driver Version: 535.104.12 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 Tesla V100-PCIE-16GB On | 00000000:00:06.0 Off | 0 |
| N/A 32C P0 44W / 250W | 666MiB / 16384MiB | 29% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 960438 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960439 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960442 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960467 C /cuda-samples/vectorAdd 12MiB |
+---------------------------------------------------------------------------------------+
#
kubectl delete deployment time-slicing-verification -n default

MIG

Set up the GPUs with MIG (Multi-Instance GPU). Read more about MIG.

kubectl get pods -n datalayer-cuda-operator -w
kubectl get pods -n datalayer-cuda-operator -l app=nvidia-mig-manager

Get the list of supported configurations and pick one of them (e.g. all-1g.10gb).

kubectl describe configmap -n datalayer-cuda-operator default-mig-parted-config
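
For illustration, an entry such as all-1g.10gb in that ConfigMap follows the mig-parted configuration format and looks roughly like this (the exact profiles and instance counts depend on the GPU model):

version: v1
mig-configs:
  all-1g.10gb:
  - devices: all
    mig-enabled: true
    mig-devices:
      "1g.10gb": 7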

Configure MIG.

kubectl get nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true"
kubectl label nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" nvidia.com/mig.config=all-1g.10gb --overwrite
kubectl logs -n datalayer-cuda-operator -l app=nvidia-mig-manager
# MIG configuration applied successfully
# Restarting validator pod to re-run all validations
# pod "nvidia-operator-validator-zmpcn" deleted
# Restarting all GPU clients previously shutdown in Kubernetes by reenabling their component-specific nodeSelector labels
# node/jupyter-cuda-large-node-861557 labeled
# Changing the 'nvidia.com/mig.config.state' node label to 'success'
# node/jupyter-cuda-large-node-861557 labeled
# time="2024-04-16T11:56:36Z" level=info msg="Successfully updated to MIG config: all-1g.5gb"
# time="2024-04-16T11:56:36Z" level=info msg="Waiting for change to 'nvidia.com/mig.config' label"
kubectl get nodes -l nvidia.com/gpu.present=true --show-labels | grep "nvidia.com/gpu"
kubectl get nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" -o json | jq '.items[].status.allocatable'
kubectl describe node -l nvidia.com/gpu.present=true
# ...
# Labels:
# nvidia.com/gpu.count=4
# nvidia.com/gpu.product=Tesla-... # <<< If the MIG configuration is invalid, this value contains "INVALID"
# Capacity:
# nvidia.com/gpu: 0
# ...
# Allocatable:
# nvidia.com/gpu: 0
# ...
kubectl describe node -l nvidia.com/gpu.present=true | grep "nvidia.com/gpu"
kubectl get nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" -o json | jq '.items[].metadata.labels'
kubectl get nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" -o json | jq '.items[].metadata.labels' | grep mig
# "nvidia.com/gpu.deploy.mig-manager": "true"
# "nvidia.com/mig.config": "all-1g.5gb"
# "nvidia.com/mig.config.state": "success" # <<< Possible value: pending | rebooting | success
# "nvidia.com/mig.strategy": "single" # <<< Possible value: single | mixed | none

You can update the MIG configuration.

kubectl label nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" nvidia.com/mig.config=all-3g.20gb --overwrite

You can also disable MIG.

kubectl label nodes -l node.datalayer.io/variant=large -l xpu.datalayer.io/gpu-cuda="true" nvidia.com/mig.config=all-disabled --overwrite

Test GPU MIG with a Pod.

cat << EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: vectoradd-cuda
spec:
  restartPolicy: OnFailure
  containers:
  - name: vectoradd
    image: nvidia/samples:vectoradd-cuda11.2.1
    resources:
      limits:
        nvidia.com/gpu: 1
  nodeSelector:
    # nvidia.com/gpu.product: A100-SXM4-40GB-MIG-1g.5gb
    nvidia.com/gpu.present: "true"
EOF
kubectl get pod -n default -w
kubectl describe pod vectoradd-cuda -n default
kubectl logs vectoradd-cuda -n default
kubectl delete pod vectoradd-cuda -n default

Test GPU MIG with a Deployment.

cat << EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mig-verification
  labels:
    app: mig-verification
spec:
  replicas: 20
  selector:
    matchLabels:
      app: mig-verification
  template:
    metadata:
      labels:
        app: mig-verification
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: nvidia.com/gpu.present
                operator: In
                values:
                - "true"
      hostPID: true
      containers:
      - name: cuda-sample-vector-add
        image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04
        command: ["/bin/bash", "-c", "--"]
        args:
        - while true; do /cuda-samples/vectorAdd; done
        resources:
          limits:
            nvidia.com/gpu: 1
EOF
kubectl get pod -n default -w
kubectl logs deploy/mig-verification -n default -f
#
kubectl exec mig-verification-5b7fcc97db-8m5gl -- nvidia-smi
Sat Nov 11 05:46:16 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12 Driver Version: 535.104.12 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 Tesla V100-PCIE-16GB On | 00000000:00:06.0 Off | 0 |
| N/A 32C P0 44W / 250W | 666MiB / 16384MiB | 29% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 960438 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960439 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960442 C /cuda-samples/vectorAdd 12MiB |
| 0 N/A N/A 960467 C /cuda-samples/vectorAdd 12MiB |
+---------------------------------------------------------------------------------------+
#
kubectl delete deployment mig-verification -n default

Further Validation

If you can run PyTorch code, use the following to confirm that the GPU works correctly and to inspect its memory.

import torch
print(torch.cuda.get_device_name(0)) # eg 'NVIDIA H100 PCIe MIG 1g.10gb'
t = torch.cuda.get_device_properties(0).total_memory
print("Total GPU Memory", t)
r = torch.cuda.memory_reserved(0)
print("Reserved GPU Memory", r)
a = torch.cuda.memory_allocated(0)
print("Allocated GPU Memory", a)
f = r - a  # free inside reserved
print("Free inside Reserved GPU Memory", f)
SIZE = 800  # try 8000 or 80000
gpu_rand = torch.rand(SIZE, SIZE, device="cuda")
print(gpu_rand)
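
The same check can be run non-interactively from outside the Pod, assuming the jupyter-test Pod from the earlier section is still running:

kubectl exec jupyter-test -n datalayer-jupyter -- \
  python -c "import torch; print(torch.cuda.get_device_name(0)); print(torch.cuda.get_device_properties(0).total_memory)"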

Tear down

If needed, tear down the CUDA operator.

plane down datalayer-cuda-operator
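
This is assumed to remove the Helm release installed by plane up; a roughly equivalent manual clean-up (the release name is an assumption) would be:

helm uninstall datalayer-cuda-operator -n datalayer-cuda-operator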