Commit f4ffa5f7 authored by Anshul

initial commit

apiVersion: apps/v1
kind: Deployment
metadata:
  name: device-plugin
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-device-plugin
  template:
    metadata:
      labels:
        app: gpu-device-plugin
    spec:
      nodeSelector:
        mps-gpu-enabled: "true"
      containers:
        - name: gpu-device-plugin
          image: xzaviourr/gpu-device-plugin:v2
          securityContext:
            privileged: true
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins/
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins/
FROM golang:1.20.4-alpine3.18
WORKDIR /app
COPY device_plugin.go .
RUN go mod init device-plugin
RUN go get google.golang.org/grpc
RUN go get k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1
RUN go build device_plugin.go
ENTRYPOINT ["./device_plugin"]
package main

import (
    "context"
    "log"
    "net"
    "os"
    "strconv"

    "google.golang.org/grpc"
    pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

type GPUDevicePlugin struct {
    socket string
    server *grpc.Server
}

func newGPUDevicePlugin(socket string) *GPUDevicePlugin {
    return &GPUDevicePlugin{
        socket: socket,
    }
}

func (devicePlugin *GPUDevicePlugin) Start() error {
    // Remove any stale socket left over from a previous run; otherwise
    // net.Listen fails with "address already in use" on restart.
    _ = os.Remove(devicePlugin.socket)
    lis, err := net.Listen("unix", devicePlugin.socket)
    if err != nil {
        return err
    }
    devicePlugin.server = grpc.NewServer()
    pluginapi.RegisterDevicePluginServer(devicePlugin.server, devicePlugin)
    log.Println("Starting GPU device plugin server ...")
    return devicePlugin.server.Serve(lis)
}

func (devicePlugin *GPUDevicePlugin) Stop() {
    if devicePlugin.server != nil {
        devicePlugin.server.Stop()
    }
}
func (devicePlugin *GPUDevicePlugin) ListAndWatch(empty *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error {
    resp := new(pluginapi.ListAndWatchResponse)
    // Advertise 100 virtual-core devices followed by 4 virtual-memory devices.
    devices := make([]*pluginapi.Device, 104)
    for i := 0; i < 100; i++ {
        devices[i] = &pluginapi.Device{
            ID:     "mps.iitb/gpu-vcore-" + strconv.Itoa(i),
            Health: pluginapi.Healthy,
        }
    }
    for i := 0; i < 4; i++ {
        // Offset past the vcore entries; writing to devices[i] here would
        // overwrite the first four vcore devices and leave the last four
        // slots nil.
        devices[100+i] = &pluginapi.Device{
            ID:     "mps.iitb/gpu-vmemory-" + strconv.Itoa(i),
            Health: pluginapi.Healthy,
        }
    }
    resp.Devices = devices
    if err := stream.Send(resp); err != nil {
        log.Printf("Error sending device list : %v", err)
        return err
    }
    log.Printf("Successfully sent the list of devices ...")
    // Block forever to keep the stream open; the kubelet treats a closed
    // ListAndWatch stream as a plugin failure.
    select {}
}
func (devicePlugin *GPUDevicePlugin) Allocate(ctx context.Context, rqt *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
    allocateResponse := &pluginapi.AllocateResponse{}
    for _, req := range rqt.ContainerRequests {
        containerAllocateResponse := &pluginapi.ContainerAllocateResponse{}
        // Each allocated vcore device stands for one percentage point of the
        // GPU, so the device count maps directly to the MPS thread percentage.
        totalCompute := len(req.DevicesIDs)
        envs := make(map[string]string)
        envs["CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = strconv.Itoa(totalCompute)
        envs["CUDA_MPS_PINNED_DEVICE_MEM_LIMIT"] = "0=2G"
        mounts := []*pluginapi.Mount{
            {
                ContainerPath: "/usr/local/nvidia",
                HostPath:      "/usr/local/nvidia",
            },
        }
        containerAllocateResponse.Envs = envs
        containerAllocateResponse.Mounts = mounts
        allocateResponse.ContainerResponses = append(allocateResponse.ContainerResponses, containerAllocateResponse)
        log.Printf("Successfully allocated the devices ...")
    }
    return allocateResponse, nil
}

func (devicePlugin *GPUDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
    return &pluginapi.DevicePluginOptions{
        PreStartRequired:                false,
        GetPreferredAllocationAvailable: false,
    }, nil
}
func (devicePlugin *GPUDevicePlugin) GetPreferredAllocation(ctx context.Context, rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
    preferredAllocateResponse := &pluginapi.PreferredAllocationResponse{}
    for _, req := range rqt.ContainerRequests {
        containerAllocateResponse := &pluginapi.ContainerPreferredAllocationResponse{}
        totalDevicesToAllot := req.AllocationSize
        for i := 0; i < len(req.MustIncludeDeviceIDs); i++ {
            containerAllocateResponse.DeviceIDs = append(containerAllocateResponse.DeviceIDs, req.MustIncludeDeviceIDs[i])
            totalDevicesToAllot--
        }
        preferredAllocateResponse.ContainerResponses = append(preferredAllocateResponse.ContainerResponses, containerAllocateResponse)
    }
    return preferredAllocateResponse, nil
}

func (devicePlugin *GPUDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
    preStartContainerResponse := pluginapi.PreStartContainerResponse{}
    return &preStartContainerResponse, nil
}
func main() {
    socketPlugin := "/var/lib/kubelet/device-plugins/gpu-device-plugin.sock"
    devicePlugin := newGPUDevicePlugin(socketPlugin)
    defer devicePlugin.Stop()
    if err := devicePlugin.Start(); err != nil {
        log.Fatalf("Error starting the device plugin : %v", err)
    }
}
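
As committed, the plugin serves the DevicePlugin gRPC API but never registers itself with the kubelet, so the kubelet will not route allocations to it. A minimal registration sketch follows, assuming the resource is advertised as mps.iitb/gpu-vcore (no resource name is fixed anywhere in this commit, and the helper itself is hypothetical):

// register_sketch.go -- hypothetical helper, not part of this commit.
// Dials the kubelet's registration socket and announces the plugin's
// endpoint and resource name.
package main

import (
    "context"
    "path"
    "time"

    "google.golang.org/grpc"
    pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

func registerWithKubelet(endpoint, resourceName string) error {
    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()
    // pluginapi.KubeletSocket is /var/lib/kubelet/device-plugins/kubelet.sock.
    conn, err := grpc.DialContext(ctx, "unix:"+pluginapi.KubeletSocket,
        grpc.WithInsecure(), grpc.WithBlock())
    if err != nil {
        return err
    }
    defer conn.Close()
    client := pluginapi.NewRegistrationClient(conn)
    _, err = client.Register(ctx, &pluginapi.RegisterRequest{
        Version:      pluginapi.Version,
        Endpoint:     path.Base(endpoint), // socket name relative to the device-plugins dir
        ResourceName: resourceName,
    })
    return err
}

Since Start blocks in Serve, main would call registerWithKubelet("gpu-device-plugin.sock", "mps.iitb/gpu-vcore") from a goroutine once the server is listening.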
module device-plugin

go 1.20

require (
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/golang/protobuf v1.5.3 // indirect
	golang.org/x/net v0.10.0 // indirect
	golang.org/x/sys v0.8.0 // indirect
	golang.org/x/text v0.9.0 // indirect
	google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect
	google.golang.org/grpc v1.55.0 // indirect
	google.golang.org/protobuf v1.30.0 // indirect
	k8s.io/kubelet v0.27.2 // indirect
	k8s.io/kubernetes v1.15.0-alpha.0 // indirect
)
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A=
google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU=
google.golang.org/grpc v1.55.0 h1:3Oj82/tFSCeUrRTg/5E/7d/W5A1tj6Ky1ABAuZuv5ag=
google.golang.org/grpc v1.55.0/go.mod h1:iYEXKGkEBhg1PjZQvoYEVPTDkHo1/bjTnfwTeGONTY8=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
k8s.io/kubelet v0.27.2 h1:vpJnBkqQjxItEhehKG0toXoZ+G+tf4UXAOqtMJy6qgc=
k8s.io/kubelet v0.27.2/go.mod h1:1SVrHaLnuw53nQJx8036k9HjE0teDXZtbN51cYC0HSc=
k8s.io/kubernetes v1.15.0-alpha.0 h1:gIwzzIgt0BX8fznDZRxdZv2837cEmVetI54aUfzkPk8=
k8s.io/kubernetes v1.15.0-alpha.0/go.mod h1:ocZa8+6APFNC2tX1DZASIbocyYT5jHzqFVsY5aoB7Jk=
k8s.io/kubernetes v1.27.2 h1:g4v9oY6u7vBUDEuq4FvC50Bbw2K7GZuvM00IIESWVf4=
k8s.io/kubernetes v1.27.2/go.mod h1:U8ZXeKBAPxeb4J4/HOaxjw1A9K6WfSH+fY2SS7CR6IM=
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: gpu-device-plugin
  template:
    metadata:
      labels:
        app: gpu-device-plugin
    spec:
      hostPID: true
      hostIPC: true
      hostNetwork: true
      serviceAccount: gpu-device-plugin-manager
      nodeSelector:
        mps-gpu-enabled: "true"
      containers:
        - name: gpu-device-plugin
          image: xzaviourr/gpu-device-plugin:v4
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
FROM golang:1.20.4-alpine3.18
WORKDIR /app
ENV KUBECONFIG=/root/.kube/config
COPY scheduler_ext.go .
RUN go mod init scheduler_ext
RUN go get google.golang.org/grpc
RUN go get k8s.io/api/core/v1
RUN go get k8s.io/apimachinery/pkg/apis/meta/v1
RUN go get k8s.io/apimachinery/pkg/util/wait
RUN go get k8s.io/client-go/kubernetes
RUN go get k8s.io/client-go/tools/cache
RUN go get k8s.io/client-go/tools/clientcmd
RUN go get k8s.io/klog
RUN go get k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1
RUN go build scheduler_ext.go
EXPOSE 8080
ENTRYPOINT ["./scheduler_ext"]
module scheduler-ext

go 1.20

require (
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/emicklei/go-restful/v3 v3.9.0 // indirect
	github.com/go-logr/logr v1.2.3 // indirect
	github.com/go-openapi/jsonpointer v0.19.6 // indirect
	github.com/go-openapi/jsonreference v0.20.1 // indirect
	github.com/go-openapi/swag v0.22.3 // indirect
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/golang/protobuf v1.5.3 // indirect
	github.com/google/gnostic v0.5.7-v3refs // indirect
	github.com/google/go-cmp v0.5.9 // indirect
	github.com/google/gofuzz v1.1.0 // indirect
	github.com/google/uuid v1.3.0 // indirect
	github.com/imdario/mergo v0.3.6 // indirect
	github.com/josharian/intern v1.0.0 // indirect
	github.com/json-iterator/go v1.1.12 // indirect
	github.com/mailru/easyjson v0.7.7 // indirect
	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
	github.com/modern-go/reflect2 v1.0.2 // indirect
	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
	github.com/spf13/pflag v1.0.5 // indirect
	golang.org/x/net v0.8.0 // indirect
	golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect
	golang.org/x/sys v0.6.0 // indirect
	golang.org/x/term v0.6.0 // indirect
	golang.org/x/text v0.8.0 // indirect
	golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
	google.golang.org/appengine v1.6.7 // indirect
	google.golang.org/genproto v0.0.0-20220502173005-c8bf987b8c21 // indirect
	google.golang.org/grpc v1.51.0 // indirect
	google.golang.org/protobuf v1.28.1 // indirect
	gopkg.in/inf.v0 v0.9.1 // indirect
	gopkg.in/yaml.v2 v2.4.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
	k8s.io/api v0.27.2 // indirect
	k8s.io/apimachinery v0.27.2 // indirect
	k8s.io/client-go v0.27.2 // indirect
	k8s.io/klog v1.0.0 // indirect
	k8s.io/klog/v2 v2.90.1 // indirect
	k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f // indirect
	k8s.io/kubelet v0.27.2 // indirect
	k8s.io/utils v0.0.0-20230209194617-a36077c30491 // indirect
	sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
	sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
	sigs.k8s.io/yaml v1.3.0 // indirect
)
{
  "kind": "Policy",
  "apiVersion": "v1",
  "predicates": [
    {"name": "PodFitsHostPorts"},
    {"name": "PodFitsResources"},
    {"name": "NoDiskConflict"},
    {"name": "MatchNodeSelector"},
    {"name": "HostName"}
  ],
  "extenders": [
    {
      "urlPrefix": "http://127.0.0.1:3456/scheduler",
      "apiVersion": "v1beta1",
      "filterVerb": "predicates",
      "enableHttps": false,
      "nodeCacheCapable": false
    }
  ],
  "hardPodAffinitySymmetricWeight": 10,
  "alwaysCheckAllPredicates": false
}
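
The policy above makes the default scheduler POST filter requests to urlPrefix + "/" + filterVerb, i.e. http://127.0.0.1:3456/scheduler/predicates. Note that this Policy format is the legacy scheduler configuration, superseded by KubeSchedulerConfiguration in current Kubernetes releases, and that the Go program in this commit runs as a standalone binding controller rather than an HTTP extender, so nothing actually answers on that port. A sketch of the handler shape the policy would call, with simplified stand-ins for the real k8s.io/kube-scheduler/extender/v1 types (everything below is hypothetical):

// extender_sketch.go -- hypothetical, not part of this commit.
package main

import (
    "encoding/json"
    "log"
    "net/http"

    corev1 "k8s.io/api/core/v1"
)

// Simplified stand-ins for the extender API types; the real definitions
// live in k8s.io/kube-scheduler/extender/v1.
type ExtenderArgs struct {
    Pod   *corev1.Pod      `json:"pod"`
    Nodes *corev1.NodeList `json:"nodes,omitempty"`
}

type ExtenderFilterResult struct {
    Nodes       *corev1.NodeList  `json:"nodes,omitempty"`
    FailedNodes map[string]string `json:"failedNodes,omitempty"`
    Error       string            `json:"error,omitempty"`
}

func main() {
    // urlPrefix + "/" + filterVerb from the policy file.
    http.HandleFunc("/scheduler/predicates", func(w http.ResponseWriter, r *http.Request) {
        var args ExtenderArgs
        if err := json.NewDecoder(r.Body).Decode(&args); err != nil {
            http.Error(w, err.Error(), http.StatusBadRequest)
            return
        }
        // Pass every candidate node through unchanged; a real filter would
        // drop nodes that cannot satisfy the mps.iitb/* resource requests.
        result := ExtenderFilterResult{Nodes: args.Nodes}
        w.Header().Set("Content-Type", "application/json")
        json.NewEncoder(w).Encode(result)
    })
    log.Fatal(http.ListenAndServe(":3456", nil))
}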
package main

import (
    "context"
    "fmt"
    "os"
    "time"

    "google.golang.org/grpc"
    corev1 "k8s.io/api/core/v1"
    v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/util/wait"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/cache"
    "k8s.io/client-go/tools/clientcmd"
    "k8s.io/client-go/util/workqueue"
    "k8s.io/klog"
    pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

const (
    pluginName    = "scheduler_ext"
    gpuMemoryKey  = "mps.iitb/gpu-vmemory"
    gpuCoresKey   = "mps.iitb/gpu-vcore"
    resyncPeriod  = 5 * time.Minute
    defaultWeight = 1
)

type MPSDevicePluginScheduler struct {
    clientset    kubernetes.Interface
    queue        workqueue.RateLimitingInterface
    deviceConn   *grpc.ClientConn
    deviceClient pluginapi.DevicePluginClient
}
func main() {
    kubeconfig := os.Getenv("KUBECONFIG")
    config, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
    if err != nil {
        klog.Fatalf("Error building kubeconfig: %v", err)
    }
    clientset, err := kubernetes.NewForConfig(config)
    if err != nil {
        klog.Fatalf("Error creating clientset: %v", err)
    }
    deviceConn, err := grpc.Dial("unix:/var/lib/kubelet/device-plugins/gpu-device-plugin.sock", grpc.WithInsecure())
    if err != nil {
        klog.Fatalf("Error connecting to device plugin: %v", err)
    }
    deviceClient := pluginapi.NewDevicePluginClient(deviceConn)
    scheduler := &MPSDevicePluginScheduler{
        clientset:    clientset,
        queue:        workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
        deviceConn:   deviceConn,
        deviceClient: deviceClient,
    }
    stopCh := make(chan struct{})
    defer close(stopCh)
    go scheduler.run(stopCh)
    // Periodically enqueue every pending pod; wait.Until blocks until stopCh closes.
    wait.Until(func() {
        if err := scheduler.syncPods(); err != nil {
            klog.Errorf("Error syncing pods: %v", err)
        }
    }, resyncPeriod, stopCh)
    <-stopCh
}
func (s *MPSDevicePluginScheduler) run(stopCh <-chan struct{}) {
    go wait.Until(s.worker, time.Second, stopCh)
    <-stopCh
}

func (s *MPSDevicePluginScheduler) worker() {
    for s.processNextItem() {
    }
}

func (s *MPSDevicePluginScheduler) processNextItem() bool {
    obj, shutdown := s.queue.Get()
    if shutdown {
        return false
    }
    err := func(obj interface{}) error {
        // The deferred Done is the only Done call; calling it again for a
        // non-string item would mark the item finished twice.
        defer s.queue.Done(obj)
        key, ok := obj.(string)
        if !ok {
            klog.Errorf("Expected string in workqueue, but got %#v", obj)
            return nil
        }
        if err := s.syncPod(key); err != nil {
            return fmt.Errorf("error syncing pod '%s': %v", key, err)
        }
        s.queue.Forget(obj)
        klog.Infof("Successfully synced pod '%s'", key)
        return nil
    }(obj)
    if err != nil {
        klog.Error(err)
        return true
    }
    return true
}
func (s *MPSDevicePluginScheduler) syncPod(key string) error {
    namespace, name, err := cache.SplitMetaNamespaceKey(key)
    if err != nil {
        return err
    }
    pod, err := s.clientset.CoreV1().Pods(namespace).Get(context.TODO(), name, v1.GetOptions{})
    if err != nil {
        return err
    }
    if isPodScheduled(pod) {
        return nil
    }
    nodeName, err := s.findBestNode(pod)
    if err != nil {
        return err
    }
    // spec.nodeName is immutable on an existing pod, so an Update that sets
    // it is rejected by the API server; binding goes through the
    // pods/binding subresource instead.
    binding := &corev1.Binding{
        ObjectMeta: v1.ObjectMeta{
            Namespace: pod.Namespace,
            Name:      pod.Name,
        },
        Target: corev1.ObjectReference{
            Kind: "Node",
            Name: nodeName,
        },
    }
    return s.clientset.CoreV1().Pods(pod.Namespace).Bind(context.TODO(), binding, v1.CreateOptions{})
}
func (s *MPSDevicePluginScheduler) findBestNode(pod *corev1.Pod) (string, error) {
    nodes, err := s.clientset.CoreV1().Nodes().List(context.TODO(), v1.ListOptions{
        LabelSelector: "gpu=true",
    })
    if err != nil {
        return "", err
    }
    var bestNode string
    bestScore := -1
    for _, node := range nodes.Items {
        if !s.isMPSAvailable(node) {
            continue
        }
        score := s.calculateNodeScore(node, pod)
        if score > bestScore {
            bestNode = node.Name
            bestScore = score
        }
    }
    // Binding to an empty node name would fail, so surface the case where
    // no MPS-enabled candidate exists.
    if bestNode == "" {
        return "", fmt.Errorf("no MPS-enabled node found for pod '%s/%s'", pod.Namespace, pod.Name)
    }
    return bestNode, nil
}

func (s *MPSDevicePluginScheduler) isMPSAvailable(node corev1.Node) bool {
    if node.Labels == nil {
        return false
    }
    _, mpsEnabled := node.Labels["mps-enabled"]
    return mpsEnabled
}
func (s *MPSDevicePluginScheduler) calculateNodeScore(node corev1.Node, pod *corev1.Pod) int {
    score := 0
    for _, container := range pod.Spec.Containers {
        if container.Resources.Requests == nil {
            continue
        }
        // Score memory and cores independently; an early `continue` on a
        // missing memory request would otherwise skip the cores check too.
        if gpuMemoryReq, ok := container.Resources.Requests[corev1.ResourceName(gpuMemoryKey)]; ok {
            if gpuMemoryNode, ok := node.Status.Allocatable[corev1.ResourceName(gpuMemoryKey)]; ok {
                if gpuMemoryReq.Value() <= gpuMemoryNode.Value() {
                    score++
                } else {
                    score--
                }
            }
        }
        if gpuCoresReq, ok := container.Resources.Requests[corev1.ResourceName(gpuCoresKey)]; ok {
            if gpuCoresNode, ok := node.Status.Allocatable[corev1.ResourceName(gpuCoresKey)]; ok {
                if gpuCoresReq.Value() <= gpuCoresNode.Value() {
                    score++
                } else {
                    score--
                }
            }
        }
    }
    return score
}
func (s *MPSDevicePluginScheduler) syncPods() error {
    pods, err := s.clientset.CoreV1().Pods("").List(context.TODO(), v1.ListOptions{
        FieldSelector: "status.phase!=Succeeded,status.phase!=Failed",
    })
    if err != nil {
        return err
    }
    for _, pod := range pods.Items {
        key, err := cache.MetaNamespaceKeyFunc(&pod)
        if err != nil {
            klog.Errorf("Error getting key for pod '%s/%s': %v", pod.Namespace, pod.Name, err)
            continue
        }
        s.queue.Add(key)
    }
    return nil
}

func isPodScheduled(pod *corev1.Pod) bool {
    return pod.Spec.NodeName != ""
}
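
As committed, the controller discovers pods only through the five-minute syncPods resync, so a new pod can sit pending for minutes before it is bound. The usual pattern is to feed watch events into the same workqueue; a sketch using a client-go shared informer follows (hypothetical, and it assumes "k8s.io/client-go/informers" is added to the import block):

// Hypothetical addition, not part of this commit: feeds pod add/update
// events into the scheduler's workqueue so pending pods are picked up
// immediately instead of waiting for the next resync.
func (s *MPSDevicePluginScheduler) runPodInformer(stopCh <-chan struct{}) {
    factory := informers.NewSharedInformerFactory(s.clientset, resyncPeriod)
    podInformer := factory.Core().V1().Pods().Informer()
    // Enqueue on add and update; syncPod already skips pods that are
    // scheduled, so over-enqueueing is harmless.
    podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
        AddFunc: func(obj interface{}) {
            if key, err := cache.MetaNamespaceKeyFunc(obj); err == nil {
                s.queue.Add(key)
            }
        },
        UpdateFunc: func(oldObj, newObj interface{}) {
            if key, err := cache.MetaNamespaceKeyFunc(newObj); err == nil {
                s.queue.Add(key)
            }
        },
    })
    factory.Start(stopCh)
    cache.WaitForCacheSync(stopCh, podInformer.HasSynced)
}

main would then start it alongside the worker with go scheduler.runPodInformer(stopCh).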
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-scheduler-extender
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-scheduler-extender
  template:
    metadata:
      labels:
        app: gpu-scheduler-extender
    spec:
      containers:
        - name: gpu-scheduler-extender
          image: xzaviourr/gpu-scheduler-extender:v3
          imagePullPolicy: IfNotPresent
          resources:
            limits:
              cpu: 200m
              memory: 200Mi
            requests:
              cpu: 100m
              memory: 100Mi
          volumeMounts:
            - name: kubeconfig-volume
              mountPath: /root/.kube/config
              readOnly: true
            - name: socket-volume
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: kubeconfig-volume
          hostPath:
            path: /root/.kube/config
            type: File
        - name: socket-volume
          hostPath:
            path: /var/lib/kubelet/device-plugins
            type: Directory
apiVersion: v1
kind: Pod
metadata:
  name: cuda-test
spec:
  restartPolicy: OnFailure
  # automountServiceAccountToken: false
  # nodeSelector:
  #   node: "worker1"
  containers:
    - name: cuda-vector-add
      image: "nvidia/samples:vectoradd-cuda10.2"
      resources:
        requests:
          xzaviourr/gpu-vcore: 5
        limits:
          xzaviourr/gpu-vcore: 5
sudo su
cd /var/lib/kubelet/device-plugins
ls
kubectl get daemonsets --all-namespaces
kubectl describe node ub-10
sudo lsof -n -i -P | grep kubelet.sock
kubectl get pods --all-namespaces
kubectl logs gpu-device-plugin-daemonset-s684c --namespace gpu-device-plugin-namespace
#!/bin/bash
# Setup kubernetes cluster
sudo kubeadm reset # Delete existing master
rm $HOME/.kube/config
sudo rm -rf /etc/cni/net.d
sudo swapoff -a # Disable swap (required by kubelet)
sudo kubeadm init --pod-network-cidr=10.244.0.0/16 # Initialize cluster
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml # Use flannel for networking
kubectl taint nodes ub-10 node-role.kubernetes.io/master- # Allow device plugins and pods to run on master
kubectl label node ub-10 mps-gpu-enabled=true # Add device plugin label
# Delete daemonset
# kubectl delete daemonset gpu-device-plugin-daemonset
# kubectl delete clusterrolebinding gpu-device-plugin-manager-role
# Attach daemonset again
# kubectl create namespace gpu-device-plugin-namespace
kubectl create sa gpu-device-plugin-manager -n kube-system
kubectl create clusterrolebinding gpu-device-plugin-manager-role --clusterrole=cluster-admin --serviceaccount=kube-system:gpu-device-plugin-manager
kubectl apply -f gpu_device_plugin/mps-manager.yaml
# Start scheduler extender
cp $HOME/.kube/config /root/.kube/config
kubectl apply -f gpu_scheduler_extender/scheduler_extender.yaml
# kubectl apply -f gpu_scheduler_extender/scheduler_config.yaml