Commit a23a92d1 authored by Yuxin Wu

GPU Utilization Tracker

parent b7ee409b
...@@ -3,11 +3,16 @@ ...@@ -3,11 +3,16 @@
# Author: Yuxin Wu <ppwwyyxxc@gmail.com> # Author: Yuxin Wu <ppwwyyxxc@gmail.com>
import os import os
import numpy as np
import multiprocessing as mp
import time
from six.moves import map
from .base import Callback from .base import Callback
from ..utils import logger from ..utils import logger
from ..utils.concurrency import ensure_proc_terminate, subproc_call
__all__ = ['SendStat'] __all__ = ['SendStat', 'GPUUtilizationTracker']
class SendStat(Callback): class SendStat(Callback):
...@@ -25,3 +30,68 @@ class SendStat(Callback): ...@@ -25,3 +30,68 @@ class SendStat(Callback):
ret = os.system(cmd) ret = os.system(cmd)
if ret != 0: if ret != 0:
logger.error("Command {} failed with ret={}!".format(cmd, ret)) logger.error("Command {} failed with ret={}!".format(cmd, ret))
class GPUUtilizationTracker(Callback):
    """ Summarize the average GPU utilization within an epoch.

    A child process polls ``nvidia-smi`` about once per second while an epoch
    is running. At the end of the epoch the per-device average is sent back
    through a queue and reported as the scalars ``GPU<id>-Util``.

    Synchronization with the child process uses two events:

    * ``_evt`` is set by the trainer to mark both "epoch started" and
      "epoch finished"; the worker clears it once it has seen the signal.
    * ``_stop_evt`` is set (together with ``_evt``) to make the worker exit.
    """

    def __init__(self, devices):
        """
        Args:
            devices (list[int]): physical GPU ids
        """
        self._devices = list(map(str, devices))
        self._command = "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i " + \
            ','.join(self._devices)
        # Fail fast at construction time if the GPUs cannot be queried at all.
        output, ret = subproc_call(self._command)
        assert ret == 0, "Cannot fetch GPU utilization!"

    def _before_train(self):
        """Create the synchronization primitives and start the worker process."""
        self._evt = mp.Event()
        self._stop_evt = mp.Event()
        self._queue = mp.Queue()
        self._proc = mp.Process(target=self.worker, args=(
            self._evt, self._queue, self._stop_evt))
        ensure_proc_terminate(self._proc)
        self._proc.start()

    def _before_epoch(self):
        """Signal the worker to start sampling for a new epoch."""
        self._evt.set()

    def _after_epoch(self):
        """Stop sampling, fetch the per-device averages and report them."""
        # The worker may not have consumed the "start epoch" signal yet
        # (unlikely). Sleep instead of spinning so a core is not burned, and
        # give up if the worker is gone since nobody would clear the event.
        while self._evt.is_set():
            if not self._proc.is_alive():
                logger.error("GPUUtilizationTracker worker is not running; "
                             "GPU utilization is not reported.")
                return
            time.sleep(0.01)
        self._evt.set()  # stop epoch
        stats = self._queue.get()
        if stats is None:
            # The worker could not obtain a single valid sample this epoch.
            logger.warn("No GPU utilization sample was collected in this epoch.")
            return
        for idx, dev in enumerate(self._devices):
            self.trainer.monitors.put_scalar('GPU{}-Util'.format(dev), stats[idx])

    def _after_train(self):
        """Ask the worker to exit and wait for it."""
        # _stop_evt must be set before _evt: the worker checks _stop_evt
        # right after it observes _evt.
        self._stop_evt.set()
        self._evt.set()
        self._proc.join()

    def _sample(self):
        """Query the GPU utilization once.

        Returns:
            list[float] with one value per tracked device, or None when the
            query failed or its output could not be interpreted.
        """
        output, retv = subproc_call(self._command)
        if retv != 0:
            # subproc_call has already logged a warning about the failure.
            return None
        try:
            data = [float(line) for line in output.strip().split(b'\n')]
        except (TypeError, ValueError):
            logger.warn("Cannot parse GPU utilization from: {}".format(output))
            return None
        if len(data) != len(self._devices):
            logger.warn("Expected utilization of {} GPUs, but got {} values.".format(
                len(self._devices), len(data)))
            return None
        return data

    def worker(self, evt, rst_queue, stop_evt):
        """Body of the sampling process.

        Args:
            evt: event toggled by the trainer at epoch start and epoch end.
            rst_queue: queue which receives one result per epoch, either an
                array of per-device average utilization or None when no
                valid sample was collected.
            stop_evt: event telling the worker to exit.
        """
        while True:
            evt.wait()  # start epoch
            evt.clear()
            if stop_evt.is_set():   # or on exit
                return

            stats = np.zeros((len(self._devices),), dtype='f4')
            cnt = 0
            while True:
                time.sleep(1)
                # A bad sample is skipped instead of killing the worker:
                # the trainer blocks on rst_queue at the end of every epoch,
                # so the worker has to stay alive and always publish a result.
                data = self._sample()
                if data is not None:
                    stats += data
                    cnt += 1

                if evt.is_set():    # stop epoch
                    if stop_evt.is_set():   # or on exit
                        return
                    evt.clear()
                    rst_queue.put(stats / cnt if cnt > 0 else None)
                    break
...@@ -199,7 +199,7 @@ def subproc_call(cmd, timeout=None): ...@@ -199,7 +199,7 @@ def subproc_call(cmd, timeout=None):
timeout(float): timeout in seconds. timeout(float): timeout in seconds.
Returns: Returns:
output(str), retcode(int). If timeout, retcode is -1. output(bytes), retcode(int). If timeout, retcode is -1.
""" """
try: try:
output = subprocess.check_output( output = subprocess.check_output(
...@@ -211,9 +211,12 @@ def subproc_call(cmd, timeout=None): ...@@ -211,9 +211,12 @@ def subproc_call(cmd, timeout=None):
logger.warn(e.output) logger.warn(e.output)
return e.output, -1 return e.output, -1
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
logger.warn("Commnad failed: {}".format(e.returncode)) logger.warn("Command failed: {}".format(e.returncode))
logger.warn(e.output) logger.warn(e.output)
return e.output, e.returncode return e.output, e.returncode
except Exception:
logger.warn("Command failed to run: {}".format(cmd))
return "", -2
class OrderedContainer(object): class OrderedContainer(object):
......
...@@ -42,6 +42,11 @@ class StatCounter(object): ...@@ -42,6 +42,11 @@ class StatCounter(object):
assert len(self._values) assert len(self._values)
return max(self._values) return max(self._values)
@property
def min(self):
    """Smallest value recorded so far; the counter must not be empty."""
    recorded = self._values
    assert len(recorded)
    return min(recorded)
class RatioCounter(object): class RatioCounter(object):
""" A counter to count ratio of something. """ """ A counter to count ratio of something. """
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment