Commit 64c132ae authored by Anshul

Added code for new device plugin

parent 5d29e9d6
FROM golang:1.20.4-alpine3.18
WORKDIR /app
COPY mps-device-plugin.go .
RUN go mod init mps-device-plugin
RUN go get google.golang.org/grpc
RUN go get k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1
# Resolve the remaining imports (e.g. github.com/golang/glog) and compile the plugin binary
RUN go mod tidy
RUN go build -o mps-device-plugin .
ENTRYPOINT ["./mps-device-plugin"]
\ No newline at end of file
module mps-device-plugin

go 1.20

require (
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/golang/glog v1.1.1
	github.com/golang/protobuf v1.5.3 // indirect
	golang.org/x/net v0.8.0 // indirect
	golang.org/x/sys v0.6.0 // indirect
	golang.org/x/text v0.8.0 // indirect
	google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4 // indirect
	google.golang.org/grpc v1.55.0
	google.golang.org/protobuf v1.30.0 // indirect
	k8s.io/kubelet v0.27.2
)
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/glog v1.1.1 h1:jxpi2eWoU84wbX9iIEyAeeoac3FLuifZpY9tcNUD9kw=
github.com/golang/glog v1.1.1/go.mod h1:zR+okUeTbrL6EL3xHUDxZuEtGv04p5shwip1+mL/rLQ=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4 h1:DdoeryqhaXp1LtT/emMP1BRJPHHKFi5akj/nbx/zNTA=
google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4/go.mod h1:NWraEVixdDnqcqQ30jipen1STv2r/n24Wb7twVTGR4s=
google.golang.org/grpc v1.55.0 h1:3Oj82/tFSCeUrRTg/5E/7d/W5A1tj6Ky1ABAuZuv5ag=
google.golang.org/grpc v1.55.0/go.mod h1:iYEXKGkEBhg1PjZQvoYEVPTDkHo1/bjTnfwTeGONTY8=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
k8s.io/kubelet v0.27.2 h1:vpJnBkqQjxItEhehKG0toXoZ+G+tf4UXAOqtMJy6qgc=
k8s.io/kubelet v0.27.2/go.mod h1:1SVrHaLnuw53nQJx8036k9HjE0teDXZtbN51cYC0HSc=
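// mps-device-plugin is a Kubernetes device plugin that advertises virtual GPU
// compute (nvidia-mps.com/vcore) and memory (nvidia-mps.com/vmemory) resources
// intended to be shared across containers through NVIDIA CUDA MPS.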
package main

import (
	"context"
	"fmt"
	"net"
	"os"
	"strconv"

	"github.com/golang/glog"
	"google.golang.org/grpc"
	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

const (
	// Binary and command used by the (currently disabled) MPS health check.
	mpsControlBin      = "/usr/local/nvidia/bin/nvidia-cuda-mps-control"
	mpsActiveThreadCmd = "get_default_active_thread_percentage"

	// Environment variables understood by the CUDA MPS control daemon.
	mpsMemLimitEnv    = "CUDA_MPS_PINNED_DEVICE_MEM_LIMIT"
	mpsThreadLimitEnv = "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"

	// Unix socket on which this device plugin serves the kubelet device plugin API.
	socketPlugin = "/var/lib/kubelet/device-plugins/mps-device-plugin.sock"
)

var (
	computeResourceName = "nvidia-mps.com/vcore"
	memResourceName     = "nvidia-mps.com/vmemory"
)
type mpsGPUManager struct {
	grpcServer           *grpc.Server
	devices              map[string]pluginapi.Device
	socket               string
	computePartitionSize int // In utilization percent (e.g. 1%)
	memPartitionSize     int // In MB (e.g. 256MB)
}

func NewMpsGPUManager(computePartitionSize, memPartitionSize int) *mpsGPUManager {
	return &mpsGPUManager{
		devices:              make(map[string]pluginapi.Device),
		computePartitionSize: computePartitionSize,
		memPartitionSize:     memPartitionSize,
		socket:               socketPlugin,
	}
}
func (mgm *mpsGPUManager) ListDevices() []*pluginapi.Device {
	gpuMemoryAvailable := 16384 // Total GPU memory in MB; using a static value for now
	computeDevicesCount := 100 / mgm.computePartitionSize
	memoryDevicesCount := gpuMemoryAvailable / mgm.memPartitionSize
	virtualDevices := make([]*pluginapi.Device, computeDevicesCount+memoryDevicesCount)
	for i := 0; i < computeDevicesCount; i++ {
		virtualDeviceID := fmt.Sprintf("%s-%d", computeResourceName, i)
		virtualDevices[i] = &pluginapi.Device{
			ID:     virtualDeviceID,
			Health: pluginapi.Healthy,
		}
	}
	for i := 0; i < memoryDevicesCount; i++ {
		virtualDeviceID := fmt.Sprintf("%s-%d", memResourceName, i)
		virtualDevices[computeDevicesCount+i] = &pluginapi.Device{
			ID:     virtualDeviceID,
			Health: pluginapi.Healthy,
		}
	}
	return virtualDevices
}
// func (mgm *mpsGPUManager) isMpsHealthy() error {
// glog.Infof("Checking status of MPS on the node")
// var out bytes.Buffer
// reader, writer := io.Pipe()
// defer writer.Close()
// defer reader.Close()
// mpsCmd := exec.Command(mpsControlBin)
// mpsCmd.Stdin = reader
// mpsCmd.Stdout = &out
// err := mpsCmd.Start()
// if err != nil {
// return fmt.Errorf("failed to start NVIDIA MPS health check: %v", err)
// }
// writer.Write([]byte(mpsActiveThreadCmd))
// writer.Close()
// err = mpsCmd.Wait()
// if err != nil {
// return fmt.Errorf("failed to health check NVIDIA MPS: %v", err)
// }
// reader.Close()
// glog.Infof("MPS is healthy, active thread percentage = %s", out.String())
// return nil
// }
func (mgm *mpsGPUManager) Start() error {
	glog.Infof("Starting MPS GPU Manager")
	// Remove any stale socket left over from a previous run, otherwise Listen fails.
	if err := os.Remove(mgm.socket); err != nil && !os.IsNotExist(err) {
		return err
	}
	lis, err := net.Listen("unix", mgm.socket)
	if err != nil {
		return err
	}
	mgm.grpcServer = grpc.NewServer()
	pluginapi.RegisterDevicePluginServer(mgm.grpcServer, mgm)
	glog.Infof("MPS GPU Manager device plugin service is listening on %s", mgm.socket)
	return mgm.grpcServer.Serve(lis)
}

func (mgm *mpsGPUManager) Stop() {
	if mgm.grpcServer != nil {
		mgm.grpcServer.Stop()
	}
	glog.Infof("MPS GPU Manager stopped")
}
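
// NOTE: nothing in this file announces the plugin to the kubelet. A device
// plugin normally dials the kubelet's registration socket and sends one
// RegisterRequest per advertised resource name. The helper below is a minimal
// sketch of that step (an assumption, not yet called from main); the Endpoint
// is assumed to be the file name of socketPlugin above.
func (mgm *mpsGPUManager) Register(resourceName string) error {
	conn, err := grpc.Dial("unix://"+pluginapi.KubeletSocket, grpc.WithInsecure())
	if err != nil {
		return fmt.Errorf("failed to dial kubelet: %v", err)
	}
	defer conn.Close()
	client := pluginapi.NewRegistrationClient(conn)
	_, err = client.Register(context.Background(), &pluginapi.RegisterRequest{
		Version:      pluginapi.Version,
		Endpoint:     "mps-device-plugin.sock",
		ResourceName: resourceName,
	})
	return err
}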
func (mgm *mpsGPUManager) ListAndWatch(empty *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error {
	resp := new(pluginapi.ListAndWatchResponse)
	resp.Devices = mgm.ListDevices()
	if err := stream.Send(resp); err != nil {
		glog.Infof("Error sending device list: %v", err)
		return err
	}
	glog.Infof("Successfully sent the list of devices ...")
	// Device health never changes in this implementation, so block forever
	// instead of sending further updates.
	select {}
}
func (mgm *mpsGPUManager) Allocate(ctx context.Context, rqt *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
	allocateResponse := &pluginapi.AllocateResponse{}
	for _, req := range rqt.ContainerRequests {
		containerAllocateResponse := &pluginapi.ContainerAllocateResponse{}
		// Each requested virtual device maps to one percent of GPU compute.
		totalCompute := len(req.DevicesIDs)
		envs := make(map[string]string)
		envs[mpsThreadLimitEnv] = strconv.Itoa(totalCompute)
		// The pinned device memory limit is hard-coded to 2G on GPU 0 for now.
		envs[mpsMemLimitEnv] = "0=2G"
		mounts := []*pluginapi.Mount{
			{
				ContainerPath: "/usr/local/nvidia",
				HostPath:      "/usr/local/nvidia",
			},
		}
		containerAllocateResponse.Envs = envs
		containerAllocateResponse.Mounts = mounts
		allocateResponse.ContainerResponses = append(allocateResponse.ContainerResponses, containerAllocateResponse)
		glog.Infof("Successfully allocated the devices ...")
	}
	return allocateResponse, nil
}
func (mgm *mpsGPUManager) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
	return &pluginapi.DevicePluginOptions{
		PreStartRequired:                false,
		GetPreferredAllocationAvailable: false,
	}, nil
}
func (mgm *mpsGPUManager) GetPreferredAllocation(ctx context.Context, rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
	preferredAllocateResponse := &pluginapi.PreferredAllocationResponse{}
	for _, req := range rqt.ContainerRequests {
		containerAllocateResponse := &pluginapi.ContainerPreferredAllocationResponse{}
		totalDevicesToAllot := req.AllocationSize
		included := make(map[string]bool)
		for _, id := range req.MustIncludeDeviceIDs {
			containerAllocateResponse.DeviceIDs = append(containerAllocateResponse.DeviceIDs, id)
			included[id] = true
			totalDevicesToAllot--
		}
		// Fill the remainder of the request from the available device IDs.
		for _, id := range req.AvailableDeviceIDs {
			if totalDevicesToAllot <= 0 {
				break
			}
			if included[id] {
				continue
			}
			containerAllocateResponse.DeviceIDs = append(containerAllocateResponse.DeviceIDs, id)
			totalDevicesToAllot--
		}
		preferredAllocateResponse.ContainerResponses = append(preferredAllocateResponse.ContainerResponses, containerAllocateResponse)
	}
	return preferredAllocateResponse, nil
}
func (mgm *mpsGPUManager) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
	return &pluginapi.PreStartContainerResponse{}, nil
}

func main() {
	mgm := NewMpsGPUManager(1, 256)
	defer mgm.Stop()
	if err := mgm.Start(); err != nil {
		glog.Fatalf("Error starting the MPS GPU Manager: %v", err)
	}
}
#!/bin/bash
# Set up the Kubernetes cluster
sudo kubeadm reset # Tear down any existing control plane
rm -f $HOME/.kube/config
sudo rm -rf /etc/cni/net.d
sudo swapoff -a # Disable swap (required by the kubelet)
sudo kubeadm init --pod-network-cidr=10.244.0.0/16 # Initialize cluster
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml # Use flannel for networking
kubectl taint nodes ub-10 node-role.kubernetes.io/control-plane- # Allow device plugins and pods to run on the control-plane node (use master- on pre-1.24 clusters)
kubectl label node ub-10 mps-gpu-enabled=true # Add device plugin label
# Delete daemonset
# kubectl delete daemonset gpu-device-plugin-daemonset
# kubectl delete clusterrolebinding gpu-device-plugin-manager-role
# Attach daemonset again
# kubectl create namespace gpu-device-plugin-namespace
kubectl create sa gpu-device-plugin-manager -n kube-system
kubectl create clusterrolebinding gpu-device-plugin-manager-role --clusterrole=cluster-admin --serviceaccount=kube-system:gpu-device-plugin-manager
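# mps-manager.yaml (not included in this commit) is assumed to be the DaemonSet manifest that deploys this device plugin to nodes labeled mps-gpu-enabled=true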
kubectl apply -f gpu_device_plugin/mps-manager.yaml
\ No newline at end of file