Anshul / k8_with_mps / Commits

Commit 64c132ae, authored May 30, 2023 by Anshul

    Added code for new device plugin

Parent: 5d29e9d6

Showing 6 changed files with 289 additions and 0 deletions (+289, -0)
device_plugin/Dockerfile             +11   -0
device_plugin/go.mod                 +16   -0
device_plugin/go.sum                 +53   -0
device_plugin/mps-device-plugin       +0   -0
device_plugin/mps-device-plugin.go  +185   -0
device_plugin/start_cluster.sh       +24   -0
device_plugin/Dockerfile (new file, mode 100644)
FROM golang:1.20.4-alpine3.18
WORKDIR /app
COPY mps-device-plugin.go .
RUN go mod init mps-device-plugin
RUN go get google.golang.org/grpc
RUN go get k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1
# Resolve remaining imports (e.g. glog) and compile the plugin binary so the
# entrypoint below has something to run.
RUN go mod tidy && go build -o mps-device-plugin .
ENTRYPOINT ["./mps-device-plugin"]
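For reference, the image would be built with the device_plugin directory as the build context so that the COPY of mps-device-plugin.go resolves; the tag below is illustrative, not one fixed by this commit.

# Build the plugin image from the repo root (tag is illustrative).
docker build -t mps-device-plugin:dev device_plugin/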
device_plugin/go.mod (new file, mode 100644)
module mps-device-plugin
go 1.20
require (
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/glog v1.1.1 // indirect
github.com/golang/protobuf v1.5.3 // indirect
golang.org/x/net v0.8.0 // indirect
golang.org/x/sys v0.6.0 // indirect
golang.org/x/text v0.8.0 // indirect
google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4 // indirect
google.golang.org/grpc v1.55.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect
k8s.io/kubelet v0.27.2 // indirect
)
device_plugin/go.sum (new file, mode 100644)
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/glog v1.1.1 h1:jxpi2eWoU84wbX9iIEyAeeoac3FLuifZpY9tcNUD9kw=
github.com/golang/glog v1.1.1/go.mod h1:zR+okUeTbrL6EL3xHUDxZuEtGv04p5shwip1+mL/rLQ=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4 h1:DdoeryqhaXp1LtT/emMP1BRJPHHKFi5akj/nbx/zNTA=
google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4/go.mod h1:NWraEVixdDnqcqQ30jipen1STv2r/n24Wb7twVTGR4s=
google.golang.org/grpc v1.55.0 h1:3Oj82/tFSCeUrRTg/5E/7d/W5A1tj6Ky1ABAuZuv5ag=
google.golang.org/grpc v1.55.0/go.mod h1:iYEXKGkEBhg1PjZQvoYEVPTDkHo1/bjTnfwTeGONTY8=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
k8s.io/kubelet v0.27.2 h1:vpJnBkqQjxItEhehKG0toXoZ+G+tf4UXAOqtMJy6qgc=
k8s.io/kubelet v0.27.2/go.mod h1:1SVrHaLnuw53nQJx8036k9HjE0teDXZtbN51cYC0HSc=
device_plugin/mps-device-plugin (new file, mode 100755)

File added (binary; no text diff shown).
device_plugin/mps-device-plugin.go (new file, mode 100644)
package main

import (
	"context"
	"fmt"
	"net"
	"strconv"

	"github.com/golang/glog"
	"google.golang.org/grpc"
	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

const (
	mpsControlBin      = "/usr/local/nvidia/bin/nvidia-cuda-mps-control"
	mpsActiveThreadCmd = "get_default_active_thread_percentage"
	mpsMemLimitEnv     = "CUDA_MPS_PINNED_DEVICE_MEM_LIMIT"
	mpsThreadLimitEnv  = "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"
	socketPlugin       = "/var/lib/kubelet/device-plugins/mps-device-plugin.sock"
)

var (
	computeResourceName = "nvidia-mps.com/vcore"
	memResourceName     = "nvidia-mps.com/vmemory"
)

type mpsGPUManager struct {
	grpcServer           *grpc.Server
	devices              map[string]pluginapi.Device
	socket               string
	computePartitionSize int // in utilization (e.g. 1%)
	memPartitionSize     int // in MB (e.g. 256MB)
}

func NewMpsGPUManager(computePartitionSize, memPartitionSize int) *mpsGPUManager {
	return &mpsGPUManager{
		devices:              make(map[string]pluginapi.Device),
		computePartitionSize: computePartitionSize,
		memPartitionSize:     memPartitionSize,
		socket:               socketPlugin,
	}
}

// ListDevices advertises one virtual device per compute partition and one per
// memory partition, all reported healthy.
func (mgm *mpsGPUManager) ListDevices() []*pluginapi.Device {
	gpuMemoryAvailable := 16384 // using a static value for now
	computeDevicesCount := 100 / mgm.computePartitionSize
	memoryDevicesCount := gpuMemoryAvailable / mgm.memPartitionSize
	virtualDevices := make([]*pluginapi.Device, computeDevicesCount+memoryDevicesCount)
	for i := 0; i < computeDevicesCount; i++ {
		virtualDeviceID := fmt.Sprintf("%s-%d", computeResourceName, i)
		virtualDevices[i] = &pluginapi.Device{
			ID:     virtualDeviceID,
			Health: pluginapi.Healthy,
		}
	}
	for i := 0; i < memoryDevicesCount; i++ {
		virtualDeviceID := fmt.Sprintf("%s-%d", memResourceName, i)
		virtualDevices[computeDevicesCount+i] = &pluginapi.Device{
			ID:     virtualDeviceID,
			Health: pluginapi.Healthy,
		}
	}
	return virtualDevices
}

// func (mgm *mpsGPUManager) isMpsHealthy() error {
// 	glog.Infof("Checking status of MPS on the node")
// 	var out bytes.Buffer
// 	reader, writer := io.Pipe()
// 	defer writer.Close()
// 	defer reader.Close()
// 	mpsCmd := exec.Command(mpsControlBin)
// 	mpsCmd.Stdin = reader
// 	mpsCmd.Stdout = &out
// 	err := mpsCmd.Start()
// 	if err != nil {
// 		return fmt.Errorf("failed to start NVIDIA MPS health check: %v", err)
// 	}
// 	writer.Write([]byte(mpsActiveThreadCmd))
// 	writer.Close()
// 	err = mpsCmd.Wait()
// 	if err != nil {
// 		return fmt.Errorf("failed to health check NVIDIA MPS: %v", err)
// 	}
// 	reader.Close()
// 	glog.Infof("MPS is healthy, active thread percentage = %s", out.String())
// 	return nil
// }

// Start serves the DevicePlugin gRPC service on the plugin's unix socket.
func (mgm *mpsGPUManager) Start() error {
	glog.Infof("Starting MPS GPU Manager")
	lis, err := net.Listen("unix", mgm.socket)
	if err != nil {
		return err
	}
	mgm.grpcServer = grpc.NewServer()
	pluginapi.RegisterDevicePluginServer(mgm.grpcServer, mgm)
	glog.Infof("MPS GPU Manager registered with the kubelet")
	return mgm.grpcServer.Serve(lis)
}

func (mgm *mpsGPUManager) Stop() {
	if mgm.grpcServer != nil {
		mgm.grpcServer.Stop()
	}
	glog.Infof("MPS GPU Manager stopped")
}

// ListAndWatch sends the device list once, then blocks forever to keep the
// stream open; the kubelet treats the last response as the current state.
func (mgm *mpsGPUManager) ListAndWatch(empty *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error {
	resp := new(pluginapi.ListAndWatchResponse)
	resp.Devices = mgm.ListDevices()
	if err := stream.Send(resp); err != nil {
		glog.Infof("Error sending device list: %v", err)
		return err
	}
	glog.Infof("Successfully sent the list of devices ...")
	select {}
}

// Allocate translates the number of requested virtual devices into MPS limits
// injected into the container as environment variables.
func (mgm *mpsGPUManager) Allocate(ctx context.Context, rqt *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
	allocateResponse := &pluginapi.AllocateResponse{}
	for _, req := range rqt.ContainerRequests {
		containerAllocateResponse := &pluginapi.ContainerAllocateResponse{}
		totalCompute := len(req.DevicesIDs)
		envs := make(map[string]string)
		envs[mpsThreadLimitEnv] = strconv.Itoa(totalCompute)
		envs[mpsMemLimitEnv] = "0=2G" // memory limit on device 0 is hardcoded for now
		mounts := []*pluginapi.Mount{
			{
				ContainerPath: "/usr/local/nvidia",
				HostPath:      "/usr/local/nvidia",
			},
		}
		containerAllocateResponse.Envs = envs
		containerAllocateResponse.Mounts = mounts
		allocateResponse.ContainerResponses = append(allocateResponse.ContainerResponses, containerAllocateResponse)
		glog.Infof("Successfully allocated the devices ...")
	}
	return allocateResponse, nil
}

func (mgm *mpsGPUManager) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
	return &pluginapi.DevicePluginOptions{
		PreStartRequired:                false,
		GetPreferredAllocationAvailable: false,
	}, nil
}

// GetPreferredAllocation echoes back only the must-include device IDs; any
// remaining slots are left for the kubelet's default selection.
func (mgm *mpsGPUManager) GetPreferredAllocation(ctx context.Context, rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
	preferredAllocateResponse := &pluginapi.PreferredAllocationResponse{}
	for _, req := range rqt.ContainerRequests {
		containerAllocateResponse := &pluginapi.ContainerPreferredAllocationResponse{}
		totalDevicesToAllot := req.AllocationSize
		for i := 0; i < len(req.MustIncludeDeviceIDs); i++ {
			containerAllocateResponse.DeviceIDs = append(containerAllocateResponse.DeviceIDs, req.MustIncludeDeviceIDs[i])
			totalDevicesToAllot--
		}
		preferredAllocateResponse.ContainerResponses = append(preferredAllocateResponse.ContainerResponses, containerAllocateResponse)
	}
	return preferredAllocateResponse, nil
}

func (mgm *mpsGPUManager) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
	preStartContainerResponse := pluginapi.PreStartContainerResponse{}
	return &preStartContainerResponse, nil
}

func main() {
	mgm := NewMpsGPUManager(1, 256) // 1% compute slices, 256MB memory slices
	defer mgm.Stop()
	if err := mgm.Start(); err != nil {
		glog.Fatalf("Error starting the MPS GPU Manager: %v", err)
	}
}
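Note that Start() above only serves the DevicePlugin gRPC service on the plugin's own socket; for the kubelet to actually pick up the resources, a device plugin also has to announce itself over the kubelet's registration socket, and that step is not part of this commit. Below is a minimal sketch of what that handshake could look like using the same v1beta1 API; registerWithKubelet is a hypothetical helper, and it would need to be called once per resource name after the server starts.

// Sketch only: announce a resource name to the kubelet's registration
// service. registerWithKubelet is hypothetical and not part of this commit.
package main

import (
	"context"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

func registerWithKubelet(resourceName string) error {
	// pluginapi.KubeletSocket is /var/lib/kubelet/device-plugins/kubelet.sock;
	// the unix:// target scheme lets grpc dial a unix domain socket directly.
	conn, err := grpc.Dial("unix://"+pluginapi.KubeletSocket,
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		return err
	}
	defer conn.Close()

	client := pluginapi.NewRegistrationClient(conn)
	_, err = client.Register(context.Background(), &pluginapi.RegisterRequest{
		Version:      pluginapi.Version,
		Endpoint:     "mps-device-plugin.sock", // basename of socketPlugin above
		ResourceName: resourceName,             // e.g. computeResourceName
	})
	return err
}

Since this plugin advertises two resource names (vcore and vmemory), each would need its own registration call, and in a full implementation each resource is typically served from its own plugin socket.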
device_plugin/start_cluster.sh (new file, mode 100644)
#!/bin/bash
# Setup kubernetes cluster
sudo kubeadm reset                                    # Delete existing master
rm $HOME/.kube/config
sudo rm -rf /etc/cni/net.d
sudo swapoff -a                                       # Swapoff
sudo kubeadm init --pod-network-cidr=10.244.0.0/16    # Initialize cluster
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml  # Use flannel for networking
kubectl taint nodes ub-10 node-role.kubernetes.io/master-  # Allow device plugins and pods to run on master
kubectl label node ub-10 mps-gpu-enabled=true              # Add device plugin label
# Delete daemonset
# kubectl delete daemonset gpu-device-plugin-daemonset
# kubectl delete clusterrolebinding gpu-device-plugin-manager-role
# Attach daemonset again
# kubectl create namespace gpu-device-plugin-namespace
kubectl create sa gpu-device-plugin-manager -n kube-system
kubectl create clusterrolebinding gpu-device-plugin-manager-role --clusterrole=cluster-admin --serviceaccount=kube-system:gpu-device-plugin-manager
kubectl apply -f gpu_device_plugin/mps-manager.yaml
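Once the plugin is deployed (the referenced gpu_device_plugin/mps-manager.yaml is not part of this commit), the advertised resources can be checked on the node, and a pod can request slices of the GPU. The pod below is illustrative, not a manifest from this repository; with the 1% / 256MB partition sizes from main(), the limits shown correspond to 50% of SM capacity and 2GB of memory.

# Confirm the virtual resources appear in the node's capacity.
kubectl describe node ub-10 | grep 'nvidia-mps.com'

# Illustrative pod consuming 50 vcore slices and 8 vmemory slices.
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: mps-test
spec:
  restartPolicy: Never
  containers:
  - name: cuda
    image: nvidia/cuda:12.1.1-base-ubuntu22.04
    command: ["nvidia-smi"]
    resources:
      limits:
        nvidia-mps.com/vcore: 50
        nvidia-mps.com/vmemory: 8
EOF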