Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
seminar-breakout
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Shashank Suhas
seminar-breakout
Commits
a23a92d1
Commit
a23a92d1
authored
Jun 20, 2017
by
Yuxin Wu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
GPU Utilization Tracker
parent
b7ee409b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
81 additions
and
3 deletions
+81
-3
tensorpack/callbacks/stats.py
tensorpack/callbacks/stats.py
+71
-1
tensorpack/utils/concurrency.py
tensorpack/utils/concurrency.py
+5
-2
tensorpack/utils/stats.py
tensorpack/utils/stats.py
+5
-0
No files found.
tensorpack/callbacks/stats.py
View file @
a23a92d1
...
...
@@ -3,11 +3,16 @@
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>
import
os
import
numpy
as
np
import
multiprocessing
as
mp
import
time
from
six.moves
import
map
from
.base
import
Callback
from
..utils
import
logger
from
..utils.concurrency
import
ensure_proc_terminate
,
subproc_call
__all__
=
[
'SendStat'
]
__all__
=
[
'SendStat'
,
'GPUUtilizationTracker'
]
class
SendStat
(
Callback
):
...
...
@@ -25,3 +30,68 @@ class SendStat(Callback):
ret
=
os
.
system
(
cmd
)
if
ret
!=
0
:
logger
.
error
(
"Command {} failed with ret={}!"
.
format
(
cmd
,
ret
))
class GPUUtilizationTracker(Callback):
    """ Summarize the average GPU utilization within an epoch.

    A separate process repeatedly runs ``nvidia-smi`` (sleeping one second
    before each query) while an epoch is in progress and accumulates the
    per-device readings. At the end of the epoch, the mean for each device is
    reported to the trainer's monitors under the name ``GPU<id>-Util``.

    Two events coordinate the trainer and the worker process:

    * ``_evt`` is a toggle: the trainer sets it to mark "epoch started" and
      sets it again to mark "epoch finished"; the worker clears it each time
      it has observed the signal.
    * ``_stop_evt`` tells the worker to exit instead of starting or
      finishing another epoch.
    """

    def __init__(self, devices):
        """
        Args:
            devices (list[int]): physical GPU ids
        """
        self._devices = list(map(str, devices))
        self._command = "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i " + \
            ','.join(self._devices)
        # Run the query once up front so that a failing command is reported
        # at construction time; only the return code matters here.
        _, ret = subproc_call(self._command)
        assert ret == 0, "Cannot fetch GPU utilization!"

    def _before_train(self):
        """Create the synchronization primitives and start the worker process."""
        self._evt = mp.Event()
        self._stop_evt = mp.Event()
        self._queue = mp.Queue()
        self._proc = mp.Process(target=self.worker, args=(
            self._evt, self._queue, self._stop_evt))
        ensure_proc_terminate(self._proc)
        self._proc.start()

    def _before_epoch(self):
        """Signal the worker to start sampling for a new epoch."""
        self._evt.set()

    def _after_epoch(self):
        """Signal end of epoch, fetch the averaged readings and log them."""
        # The worker clears the event once it has seen the start-of-epoch
        # signal. Wait for that before reusing the event as the stop signal.
        # This is unlikely to loop; sleep briefly instead of spinning so the
        # wait does not burn a CPU core.
        while self._evt.is_set():
            time.sleep(0.001)
        self._evt.set()
        # NOTE(review): get() has no timeout, so if the worker process exits
        # early (e.g. its assertion on the nvidia-smi return code fails)
        # this call appears to block indefinitely -- confirm whether a
        # timeout is wanted.
        stats = self._queue.get()
        for idx, dev in enumerate(self._devices):
            self.trainer.monitors.put_scalar('GPU{}-Util'.format(dev), stats[idx])

    def _after_train(self):
        """Ask the worker to exit and wait for it to finish."""
        # Set the stop flag first, then wake the worker through ``_evt`` so
        # that it observes the stop flag whichever loop it is currently in.
        self._stop_evt.set()
        self._evt.set()
        self._proc.join()

    def worker(self, evt, rst_queue, stop_evt):
        """Body of the sampling process.

        Args:
            evt: event toggled by the trainer at epoch start and epoch end.
            rst_queue: queue on which the per-device mean utilization of each
                finished epoch is put.
            stop_evt: event that, when set, makes the worker return.
        """
        while True:
            evt.wait()  # start epoch
            evt.clear()
            if stop_evt.is_set():   # or on exit
                return

            # Running sum of the readings, one slot per tracked device.
            stats = np.zeros((len(self._devices),), dtype='f4')
            cnt = 0
            while True:
                time.sleep(1)
                output, retv = subproc_call(self._command)
                assert retv == 0, "Cannot fetch GPU Utilization!"
                # One line of output per device; the output is bytes.
                data = list(map(float, output.strip().split(b'\n')))
                stats += data
                cnt += 1

                if evt.is_set():    # stop epoch
                    if stop_evt.is_set():   # or on exit
                        return
                    evt.clear()
                    # cnt >= 1 here because it is incremented before the check.
                    rst_queue.put(stats / cnt)
                    break
tensorpack/utils/concurrency.py
View file @
a23a92d1
...
...
@@ -199,7 +199,7 @@ def subproc_call(cmd, timeout=None):
timeout(float): timeout in seconds.
Returns:
output(
str
), retcode(int). If timeout, retcode is -1.
output(
bytes
), retcode(int). If timeout, retcode is -1.
"""
try
:
output
=
subprocess
.
check_output
(
...
...
@@ -211,9 +211,12 @@ def subproc_call(cmd, timeout=None):
logger
.
warn
(
e
.
output
)
return
e
.
output
,
-
1
except
subprocess
.
CalledProcessError
as
e
:
logger
.
warn
(
"Comm
na
d failed: {}"
.
format
(
e
.
returncode
))
logger
.
warn
(
"Comm
an
d failed: {}"
.
format
(
e
.
returncode
))
logger
.
warn
(
e
.
output
)
return
e
.
output
,
e
.
returncode
except
Exception
:
logger
.
warn
(
"Command failed to run: {}"
.
format
(
cmd
))
return
""
,
-
2
class
OrderedContainer
(
object
):
...
...
tensorpack/utils/stats.py
View file @
a23a92d1
...
...
@@ -42,6 +42,11 @@ class StatCounter(object):
assert
len
(
self
.
_values
)
return
max
(
self
.
_values
)
@
property
def
min
(
self
):
assert
len
(
self
.
_values
)
return
min
(
self
.
_values
)
class
RatioCounter
(
object
):
""" A counter to count ratio of something. """
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment