Commit cc33602f authored by Anshul

mps device plugin from nebuly

parent 49299981
version: v1
flags:
  migStrategy: "none"
  failOnInitError: true
  nvidiaDriverRoot: "/lib/x86_64-linux-gnu"
  plugin:
    passDeviceSpecs: false
    deviceListStrategy: envvar
    deviceIDStrategy: uuid
\ No newline at end of file
{
  "version": "v1",
  "flags": {
    "migStrategy": "none",
    "failOnInitError": true,
    "nvidiaDriverRoot": "/",
    "gdsEnabled": false,
    "mofedEnabled": false,
    "plugin": {
      "passDeviceSpecs": false,
      "deviceListStrategy": [
        "envar"
      ],
      "deviceIDStrategy": "uuid",
      "cdiAnnotationPrefix": "cdi.k8s.io/",
      "nvidiaCTKPath": "/usr/bin/nvidia-ctk",
      "containerDriverRoot": "/driver-root"
    }
  },
  "resources": {
    "gpus": [
      {
        "pattern": "*",
        "name": "nvidia.com/gpu"
      }
    ]
  },
  "sharing": {
    "timeSlicing": {}
  }
}
\ No newline at end of file
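The JSON above is the effective configuration the plugin reports at startup. One way to confirm the running plugin actually picked these flags up is to read its logs; a minimal check, assuming the nebuly-nvidia namespace used by the setup script below (the label selector is an assumption about the chart's standard labels):

kubectl get pods -n nebuly-nvidia
kubectl logs -n nebuly-nvidia -l app.kubernetes.io/name=nvidia-device-plugin --all-containers --tail=100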
#!/bin/bash
# Setup kubernetes cluster
sudo kubeadm reset -f # Reset any existing control plane non-interactively
rm -f "$HOME/.kube/config"
sudo rm -rf /etc/cni/net.d
sudo swapoff -a # Disable swap (required by kubelet)
sudo kubeadm init --pod-network-cidr=10.244.0.0/16 # Initialize cluster
mkdir -p "$HOME/.kube"
sudo cp -i /etc/kubernetes/admin.conf "$HOME/.kube/config"
sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config"
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml # Use flannel for networking
kubectl taint nodes ub-10 node-role.kubernetes.io/master- # Allow device plugins and pods to run on master
# Deploy nvidia daemonset
# kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.0/nvidia-device-plugin.yml
# helm install oci://ghcr.io/nebuly-ai/helm-charts/nvidia-device-plugin --version 0.13.0 --generate-name -n nebuly-nvidia --create-namespace
kubectl label node ub-10 nos.nebuly.com/gpu-partitioning=mps
helm install oci://ghcr.io/nebuly-ai/helm-charts/nvidia-device-plugin --version 0.13.0 --generate-name -n nebuly-nvidia --create-namespace --values mps-config.yaml
# helm upgrade -i nvdp nvdp/nvidia-device-plugin --version=0.14.0 --namespace nvidia-device-plugin --create-namespace --set-file config.map.config=./config.yaml
# helm upgrade -i nvdp nvdp/nvidia-device-plugin --version=0.14.0 --namespace nvidia-device-plugin --create-namespace --set-file config.map.config=./config_map.yaml
# helm upgrade -i nvdp nvdp/nvidia-device-plugin --version=0.14.0 --namespace nvidia-device-plugin --create-namespace
\ No newline at end of file
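Once the helm install above finishes, the node should advertise the renamed MPS resource defined in this commit's configuration. A quick check, assuming the node name ub-10 and the nvidia.com/gpu-2gb rename:

kubectl describe node ub-10 | grep nvidia.com
kubectl get node ub-10 -o jsonpath='{.status.allocatable}'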
@@ -200,21 +200,64 @@ func (ps *pluginService) Allocate(ctx context.Context, rqt *pluginapi.AllocateRe
    for _, req := range rqt.ContainerRequests {
        containerAllocateResponse := &pluginapi.ContainerAllocateResponse{}
        totalCompute := len(req.DevicesIDs)

        // Environment variables that attach the container to the host MPS control
        // daemon. The number of requested device IDs is used directly as the MPS
        // active-thread percentage for this client.
        envs := make(map[string]string)
        envs["CUDA_MPS_PIPE_DIRECTORY"] = "/tmp/nvidia-mps"
        envs["CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = strconv.Itoa(totalCompute)
        // NOTE: the memory limit and visible device are hardcoded for GPU 0.
        envs["CUDA_MPS_PINNED_DEVICE_MEM_LIMIT"] = "0=2G"
        envs["NVIDIA_DRIVER_CAPABILITIES"] = "compute,utility"
        envs["NVIDIA_RUNTIME"] = "nvidia"
        envs["NVIDIA_VERSION"] = "11.7"
        envs["NVIDIA_VISIBLE_DEVICES"] = "0"
        // Expose the NVIDIA device nodes needed by the CUDA driver and MPS inside
        // the container. The paths are hardcoded for a single-GPU node (GPU 0).
        deviceNodes := []string{
            "/dev/nvidiactl",
            "/dev/nvidia-modeset",
            "/dev/nvidia-uvm",
            "/dev/nvidia-uvm-tools",
            "/dev/nvidia0",
            "/dev/nvidia-caps/nvidia-cap1",
            "/dev/nvidia-caps/nvidia-cap2",
        }
        for _, devicePath := range deviceNodes {
            containerAllocateResponse.Devices = append(containerAllocateResponse.Devices, &pluginapi.DeviceSpec{
                HostPath:      devicePath,
                ContainerPath: devicePath,
                Permissions:   "mrw",
            })
        }
        mounts := []*pluginapi.Mount{
            {
                // NVIDIA user-space libraries from the host.
                ContainerPath: "/usr/local/nvidia",
                HostPath:      "/usr/local/nvidia",
            },
            {
                // MPS pipe directory shared with the host MPS control daemon.
                ContainerPath: "/tmp/nvidia-mps",
                HostPath:      "/tmp/nvidia-mps",
                ReadOnly:      false,
            },
            {
                // NOTE: this entry targets the same ContainerPath as the first mount
                // and is mounted over it; the host path is the driver-version-specific
                // source directory.
                ContainerPath: "/usr/local/nvidia",
                HostPath:      "/usr/src/nvidia-515.76",
                ReadOnly:      false,
            },
        }
        log.Println("Prepared device specs, mounts, and MPS environment for the allocate response")
        containerAllocateResponse.Envs = envs
        containerAllocateResponse.Mounts = mounts
        allocateResponse.ContainerResponses = append(allocateResponse.ContainerResponses, containerAllocateResponse)
......
@@ -20,7 +20,7 @@ spec:
        mps-gpu-enabled: "true"
      containers:
        - name: mps-device-plugin
-         image: xzaviourr/mps-device-plugin:v6.2
+         image: xzaviourr/mps-device-plugin:v7.3
          securityContext:
            privileged: true
          volumeMounts:
......
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
WORKDIR /app
COPY gpu_detection.cu .
RUN nvcc -o gpu_detection gpu_detection.cu
CMD ["./gpu_detection"]
\ No newline at end of file
#!/bin/bash
# Build the gpu_detection test image and export/re-import it so it is available
# locally for pods that use imagePullPolicy: Never. Pass the image version as $1.
sudo docker build -t "xzaviourr/gpu_detection:v$1" .
sudo docker save -o "gpu_detection:v$1.tar" "xzaviourr/gpu_detection:v$1"
sudo docker load -i "gpu_detection:v$1.tar"
\ No newline at end of file
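The save/load round-trip above only makes the image visible to Docker itself. If the cluster's container runtime is containerd rather than Docker (an assumption about the runtime, not part of this commit), the tarball would instead need to be imported into containerd's k8s.io namespace, for example for the v1.0 tag used by the pod manifests below:

sudo ctr -n k8s.io images import gpu_detection:v1.0.tar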
#include <cuda.h>
#include <iostream>
#include <cuda_runtime.h>
#include <unistd.h>

int main() {
    // Report the CUDA runtime version visible inside the container.
    int cudaVersion;
    cudaError_t cudaResult = cudaRuntimeGetVersion(&cudaVersion);
    if (cudaResult != cudaSuccess) {
        printf("Failed to retrieve CUDA version: %s\n", cudaGetErrorString(cudaResult));
        return 1;
    }
    printf("CUDA Version: %d.%d\n", cudaVersion / 1000, (cudaVersion % 100) / 10);

    // Report the NVIDIA driver version.
    int driverVersion;
    cudaResult = cudaDriverGetVersion(&driverVersion);
    if (cudaResult != cudaSuccess) {
        printf("Failed to retrieve NVIDIA driver version: %s\n", cudaGetErrorString(cudaResult));
        return 1;
    }
    int major = driverVersion / 1000;
    int minor = (driverVersion % 100) / 10;
    printf("NVIDIA Driver Version: %d.%d\n", major, minor);

    // Check whether any GPU devices are visible to the container.
    int numDevices = 0;
    cudaResult = cudaGetDeviceCount(&numDevices);
    if (cudaResult != cudaSuccess || numDevices == 0) {
        printf("No GPU device found. Make sure the container has GPU access.\n");
    } else {
        printf("%d GPU device(s) found. The container has GPU access.\n", numDevices);
    }

    // Keep the container alive for inspection without busy-spinning a CPU core.
    while (1) {
        sleep(60);
    }
    return 0;
}
\ No newline at end of file
apiVersion: v1
kind: Pod
metadata:
  name: gpu-detection
spec:
  restartPolicy: OnFailure
  hostIPC: true
  securityContext:
    runAsUser: 1000
  containers:
    - name: gpu-detection
      image: "xzaviourr/gpu_detection:v1.0"
      imagePullPolicy: Never
      resources:
        requests:
          nvidia.com/gpu-2gb: 1
        limits:
          nvidia.com/gpu-2gb: 1
      # volumeMounts:
      #   - name: nvidia-smi
      #     mountPath: /app/smi
      #   - name: lib-x
      #     mountPath: /lib/x86_64-linux-gnu
  # volumes:
  #   - name: nvidia-smi
  #     hostPath:
  #       path: /usr/bin/nvidia-smi
  #   - name: lib-x
  #     hostPath:
  #       path: /lib/x86_64-linux-gnu
\ No newline at end of file
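Once the gpu-detection pod above is Running, the MPS wiring performed by the Allocate code earlier in this commit can be spot-checked from inside the container; a sketch, assuming the pod name gpu-detection from the manifest above:

kubectl exec gpu-detection -- env | grep -E 'CUDA_MPS|NVIDIA_VISIBLE_DEVICES'
kubectl logs gpu-detection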
apiVersion: v1
kind: Pod
metadata:
  name: gpu-detection1
spec:
  restartPolicy: OnFailure
  containers:
    - name: gpu-detection1
      image: "xzaviourr/gpu_detection:v1.0"
      # NOTE: these are Docker CLI flags; as container args they are passed
      # verbatim to the entrypoint and have no effect in Kubernetes.
      args:
        - "--runtime=nvidia"
        - "--ipc=host"
        - "--gpus=all"
      securityContext:
        privileged: true
      imagePullPolicy: Never
      resources:
        requests:
          nvidia.com/gpu: 25
        limits:
          nvidia.com/gpu: 25
\ No newline at end of file
@@ -3,18 +3,19 @@ kind: Pod
metadata:
  name: vect-add
spec:
  hostIPC: true
  restartPolicy: OnFailure
  securityContext:
    runAsUser: 1000
  containers:
    - name: vect-add
      image: "xzaviourr/vect_add:v1.2"
      securityContext:
        privileged: true
      imagePullPolicy: Never
      resources:
        requests:
-         nvidia.com/gpu: 25
+         nvidia.com/gpu-2gb: 4
        limits:
-         nvidia.com/gpu: 25
+         nvidia.com/gpu-2gb: 4
      # volumeMounts:
      #   - name: nvidia-driver
      #     mountPath: /usr/local/nvidia
......
# Plugin configuration
# Only one of "name" or "map" should ever be set for a given deployment.
# Use "name" to point to an external ConfigMap with a list of configurations, or to have the Chart
# create a ConfigMap for you when "create" is true.
# Use "map" to build an integrated ConfigMap from a set of configurations as
# part of this helm chart. An example of setting "map" might be:
# config:
#   map:
#     default: |-
#       version: v1
#       flags:
#         migStrategy: none
#     mig-single: |-
#       version: v1
#       flags:
#         migStrategy: single
#     mig-mixed: |-
#       version: v1
#       flags:
#         migStrategy: mixed
config:
  # ConfigMap name if pulling from an external ConfigMap
  name: ""
  # If true, the ConfigMap containing the plugin configuration files will be created by the Chart,
  # initialized with an empty default configuration.
  # Otherwise, the Chart expects an existing ConfigMap named .Values.config.name.
  create: true
  # Set of named configs to build an integrated ConfigMap from
  map:
    default: |-
      version: v1
      flags:
        migStrategy: none
      sharing:
        mps:
          failRequestsGreaterThanOne: true
          resources:
            - name: nvidia.com/gpu
              rename: nvidia.com/gpu-2gb
              memoryGB: 2
              replicas: 8
              devices: ["0"]
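  # Note: with the MPS sharing block above, GPU "0" is advertised to the cluster as
  # 8 replicas of the renamed resource nvidia.com/gpu-2gb, each intended to be capped
  # at 2 GB of device memory by the MPS server.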
  # Default config name within the ConfigMap
  default: ""
  # List of fallback strategies to attempt if no config is selected and no default is provided
  fallbackStrategies: ["named", "single"]
legacyDaemonsetAPI: null
compatWithCPUManager: null
migStrategy: null
failOnInitError: null
deviceListStrategy: null
deviceIDStrategy: null
nvidiaDriverRoot: null
gdsEnabled: null
mofedEnabled: null
fullnameOverride: ""
namespaceOverride: ""
selectorLabelsOverride: {}
allowDefaultNamespace: false
imagePullSecrets: []
image:
  repository: ghcr.io/nebuly-ai/k8s-device-plugin
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  tag: ""
mps:
  enabled: true
  # The ID of the user used to run the MPS server.
  # All the containers requesting GPU resources must run as this user.
  userID: 1000
  image:
    repository: ghcr.io/nebuly-ai/nvidia-mps-server
    pullPolicy: IfNotPresent
    tag: "0.0.1"
updateStrategy:
  type: RollingUpdate
podAnnotations: {}
podSecurityContext: {}
securityContext: {}
resources: {}
nodeSelector:
  nos.nebuly.com/gpu-partitioning: "mps"
affinity: {}
tolerations:
  # This toleration is deprecated. Kept here for backward compatibility
  # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
  - key: CriticalAddonsOnly
    operator: Exists
  - key: nvidia.com/gpu
    operator: Exists
    effect: NoSchedule
  - key: "kubernetes.azure.com/scalesetpriority"
    operator: "Equal"
    value: "spot"
    effect: "NoSchedule"
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
runtimeClassName: null
# Subcharts
nfd:
  nameOverride: node-feature-discovery
  master:
    extraLabelNs:
      - nvidia.com
    serviceAccount:
      name: node-feature-discovery
  worker:
    tolerations:
      - key: "node-role.kubernetes.io/master"
        operator: "Equal"
        value: ""
        effect: "NoSchedule"
      - key: "nvidia.com/gpu"
        operator: "Equal"
        value: "present"
        effect: "NoSchedule"
    config:
      sources:
        pci:
          deviceClassWhitelist:
            - "02"
            - "0200"
            - "0207"
            - "0300"
            - "0302"
          deviceLabelFields:
            - vendor
gfd:
  enabled: false
  nameOverride: gpu-feature-discovery
  namespaceOverride: ""
\ No newline at end of file
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.0
          name: nvidia-device-plugin-ctr
          env:
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins