Commit a23a92d1 authored by Yuxin Wu

GPU Utilization Tracker

parent b7ee409b
......@@ -3,11 +3,16 @@
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>
import os
import numpy as np
import multiprocessing as mp
import time
from six.moves import map
from .base import Callback
from ..utils import logger
from ..utils.concurrency import ensure_proc_terminate, subproc_call
__all__ = ['SendStat']
__all__ = ['SendStat', 'GPUUtilizationTracker']
class SendStat(Callback):
......@@ -25,3 +30,68 @@ class SendStat(Callback):
ret = os.system(cmd)
if ret != 0:
logger.error("Command {} failed with ret={}!".format(cmd, ret))
class GPUUtilizationTracker(Callback):
    """ Summarize the average GPU utilization within an epoch.

    A child process runs ``nvidia-smi`` about once per second between
    ``_before_epoch`` and ``_after_epoch``. The per-device mean of those
    samples is written to the trainer's monitors as ``GPU<id>-Util``.

    If the child process fails or dies, an error is logged once and no
    further statistics are reported, instead of blocking training forever.
    """

    def __init__(self, devices):
        """
        Args:
            devices (list[int]): physical GPU ids
        """
        self._devices = list(map(str, devices))
        self._command = "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i " + \
            ','.join(self._devices)
        # Run the query once up front so a broken nvidia-smi is reported at
        # construction time rather than inside the worker process.
        # NOTE: kept as an assert so callers see the same exception type as before.
        _, ret = subproc_call(self._command)
        assert ret == 0, "Cannot fetch GPU utilization!"

    def _before_train(self):
        """Create the synchronization primitives and start the sampling process."""
        # Set once the worker is known to be dead; disables further reporting.
        self._failed = False
        # `_evt` toggles epoch start / epoch end; `_stop_evt` asks the worker to exit.
        self._evt = mp.Event()
        self._stop_evt = mp.Event()
        self._queue = mp.Queue()
        self._proc = mp.Process(target=self.worker, args=(
            self._evt, self._queue, self._stop_evt))
        ensure_proc_terminate(self._proc)
        self._proc.start()

    def _before_epoch(self):
        """Signal the worker to start sampling for this epoch."""
        self._evt.set()

    def _after_epoch(self):
        """Signal the end of the epoch and publish the averaged utilization."""
        if self._failed:
            return
        # The worker clears the event when it starts an epoch. If the epoch was
        # extremely short it may not have done so yet; wait (without burning a
        # core), but give up if the worker is no longer running.
        while self._evt.is_set():
            if not self._proc.is_alive():
                break
            time.sleep(0.001)
        self._evt.set()
        stats = self._fetch_stats()
        if stats is None:
            self._failed = True
            logger.error(
                "GPUUtilizationTracker worker has failed; GPU utilization will no longer be reported.")
            return
        for idx, dev in enumerate(self._devices):
            self.trainer.monitors.put_scalar('GPU{}-Util'.format(dev), stats[idx])

    def _fetch_stats(self):
        """Wait for the worker's result for the finished epoch.

        Returns:
            The array put on the queue by the worker, or None if the worker
            reported a failure or died without producing a result.
        """
        # Local import: `six` is already a dependency of this module.
        from six.moves import queue
        while True:
            try:
                return self._queue.get(timeout=1)
            except queue.Empty:
                # Nothing yet: keep waiting only while the worker can still answer.
                if not self._proc.is_alive():
                    return None

    def _after_train(self):
        """Ask the worker to exit and wait for it."""
        self._stop_evt.set()
        # Wake the worker whether it is waiting for an epoch or sampling.
        self._evt.set()
        self._proc.join()

    def worker(self, evt, rst_queue, stop_evt):
        """Body of the sampling process.

        Args:
            evt: event set by the parent at epoch start and again at epoch end.
            rst_queue: queue receiving one averaged array per epoch, or None
                if sampling failed.
            stop_evt: event set by the parent (together with ``evt``) to
                request exit.
        """
        try:
            while True:
                evt.wait()  # start epoch
                evt.clear()
                if stop_evt.is_set():  # or on exit
                    return
                stats = np.zeros((len(self._devices),), dtype='f4')
                cnt = 0
                while True:
                    time.sleep(1)
                    output, retv = subproc_call(self._command)
                    if retv != 0:
                        raise RuntimeError("Cannot fetch GPU Utilization!")
                    # one line of output per queried device
                    data = list(map(float, output.strip().split(b'\n')))
                    stats += data
                    cnt += 1
                    if evt.is_set():  # stop epoch
                        if stop_evt.is_set():  # or on exit
                            return
                        evt.clear()
                        rst_queue.put(stats / cnt)
                        break
        except Exception as e:
            # Tell the parent instead of dying silently, otherwise it would
            # wait on the queue forever.
            logger.error("GPUUtilizationTracker worker failed: {}".format(e))
            rst_queue.put(None)
......@@ -199,7 +199,7 @@ def subproc_call(cmd, timeout=None):
timeout(float): timeout in seconds.
Returns:
output(str), retcode(int). If timeout, retcode is -1.
output(bytes), retcode(int). If timeout, retcode is -1.
"""
try:
output = subprocess.check_output(
......@@ -211,9 +211,12 @@ def subproc_call(cmd, timeout=None):
logger.warn(e.output)
return e.output, -1
except subprocess.CalledProcessError as e:
logger.warn("Commnad failed: {}".format(e.returncode))
logger.warn("Command failed: {}".format(e.returncode))
logger.warn(e.output)
return e.output, e.returncode
except Exception:
logger.warn("Command failed to run: {}".format(cmd))
return "", -2
class OrderedContainer(object):
......
......@@ -42,6 +42,11 @@ class StatCounter(object):
assert len(self._values)
return max(self._values)
@property
def min(self):
assert len(self._values)
return min(self._values)
class RatioCounter(object):
""" A counter to count ratio of something. """
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment