GPU Utilization Tracker

a23a92d1 · Yuxin Wu · b7ee409b · a23a92d1 · a23a92d1 · a23a92d1
Commit a23a92d1 authored Jun 20, 2017 by Yuxin Wu
Showing with 81 additions and 3 deletions

tensorpack/callbacks/stats.py tensorpack/callbacks/stats.py +71 -1

tensorpack/utils/concurrency.py tensorpack/utils/concurrency.py +5 -2

tensorpack/utils/stats.py tensorpack/utils/stats.py +5 -0

No files found.
--- a/tensorpack/callbacks/stats.py
+++ b/tensorpack/callbacks/stats.py
@@ -3,11 +3,16 @@
 # Author: Yuxin Wu <ppwwyyxxc@gmail.com>

 import os
+import numpy as np
+import multiprocessing as mp
+import time
+from six.moves import map

 from .base import Callback
 from ..utils import logger
+from ..utils.concurrency import ensure_proc_terminate, subproc_call

-__all__ = ['SendStat']
+__all__ = ['SendStat', 'GPUUtilizationTracker']


 class SendStat(Callback):
@@ -25,3 +30,68 @@ class SendStat(Callback):
        ret = os.system(cmd)
        if ret != 0:
            logger.error("Command {} failed with ret={}!".format(cmd, ret))
+
+
+class GPUUtilizationTracker(Callback):
+    """ Summarize the average GPU utilization within an epoch.
+
+    A separate process runs ``nvidia-smi`` about once per second while an
+    epoch is in progress. After each epoch the per-device mean of the
+    collected samples is sent to the trainer's monitors under the name
+    ``GPU<id>-Util``.
+    """
+
+    def __init__(self, devices):
+        """
+        Args:
+            devices (list[int]): physical GPU ids
+        """
+        self._devices = list(map(str, devices))
+
+        self._command = "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i " + \
+            ','.join(self._devices)
+        # Run the query once up front so that a failing command is reported
+        # here rather than later inside the worker process.
+        _, ret = subproc_call(self._command)
+        assert ret == 0, "Cannot fetch GPU utilization!"
+
+    def _before_train(self):
+        # _evt marks both the start and the end of an epoch for the worker;
+        # _stop_evt (checked together with _evt) tells the worker to return.
+        self._evt = mp.Event()
+        self._stop_evt = mp.Event()
+        self._queue = mp.Queue()
+        self._proc = mp.Process(target=self.worker, args=(
+            self._evt, self._queue, self._stop_evt))
+        ensure_proc_terminate(self._proc)
+        self._proc.start()
+
+    def _before_epoch(self):
+        self._evt.set()
+
+    def _after_epoch(self):
+        # The worker clears the event right after it has seen the
+        # start-of-epoch signal. Wait for that, otherwise the end-of-epoch
+        # signal set below would be merged with the start signal.
+        while self._evt.is_set():   # unlikely
+            # A dead worker never clears the event: fail instead of spinning.
+            self._check_worker_alive()
+            time.sleep(0.001)
+        self._evt.set()
+        stats = self._fetch_stats()
+        for idx, dev in enumerate(self._devices):
+            self.trainer.monitors.put_scalar('GPU{}-Util'.format(dev), stats[idx])
+
+    def _after_train(self):
+        # Set the stop flag first: the worker checks it only after seeing _evt.
+        self._stop_evt.set()
+        self._evt.set()
+        self._proc.join()
+
+    def _check_worker_alive(self):
+        """ Raise RuntimeError if the worker process is not running any more. """
+        if not self._proc.is_alive():
+            raise RuntimeError(
+                "GPUUtilizationTracker worker process exited unexpectedly!")
+
+    def _fetch_stats(self):
+        """ Wait for the worker's result of the epoch that just ended.
+
+        Returns:
+            the object the worker put into the queue (per-device mean utilization).
+
+        Raises:
+            RuntimeError: if the worker process exited without delivering a result.
+        """
+        # Function-scope import: only the Empty exception is needed here.
+        from six.moves import queue
+
+        while True:
+            try:
+                # Poll with a timeout so that a dead worker cannot block the
+                # trainer forever.
+                return self._queue.get(timeout=1)
+            except queue.Empty:
+                self._check_worker_alive()
+
+    def worker(self, evt, rst_queue, stop_evt):
+        """ Main loop of the polling process.
+
+        Args:
+            evt: event set by the trainer process at the start and at the end of an epoch.
+            rst_queue: queue that receives one array of per-device mean utilization per epoch.
+            stop_evt: event which, when found set after ``evt`` fires, makes the worker return.
+        """
+        while True:
+            evt.wait()  # start epoch
+            evt.clear()
+            if stop_evt.is_set():   # or on exit
+                return
+
+            stats = np.zeros((len(self._devices),), dtype='f4')
+            cnt = 0
+            while True:
+                time.sleep(1)
+                output, retv = subproc_call(self._command)
+                # Explicit raise rather than assert, so the check is kept under
+                # "python -O". The trainer process detects that the worker exited.
+                if retv != 0:
+                    raise RuntimeError("Cannot fetch GPU Utilization!")
+                # One line of output per queried device.
+                data = list(map(float, output.strip().split(b'\n')))
+                stats += data
+                cnt += 1
+
+                if evt.is_set():    # stop epoch
+                    if stop_evt.is_set():   # or on exit
+                        return
+                    evt.clear()
+                    # At least one sample was taken above, so cnt >= 1 here.
+                    rst_queue.put(stats / cnt)
+                    break
--- a/tensorpack/utils/concurrency.py
+++ b/tensorpack/utils/concurrency.py
@@ -199,7 +199,7 @@ def subproc_call(cmd, timeout=None):
        timeout(float): timeout in seconds.

    Returns:
-        output(str), retcode(int). If timeout, retcode is -1.
+        output(bytes), retcode(int). If timeout, retcode is -1.
    """
    try:
        output = subprocess.check_output(
@@ -211,9 +211,12 @@ def subproc_call(cmd, timeout=None):
        logger.warn(e.output)
        return e.output, -1
    except subprocess.CalledProcessError as e:
-        logger.warn("Commnad failed: {}".format(e.returncode))
+        logger.warn("Command failed: {}".format(e.returncode))
        logger.warn(e.output)
        return e.output, e.returncode
+    except Exception:
+        logger.warn("Command failed to run: {}".format(cmd))
+        return "", -2


 class OrderedContainer(object):

--- a/tensorpack/utils/stats.py
+++ b/tensorpack/utils/stats.py
@@ -42,6 +42,11 @@ class StatCounter(object):
        assert len(self._values)
        return max(self._values)

+    @property
+    def min(self):
+        """ Return the minimum of ``self._values``; the assertion fails when it is empty. """
+        values = self._values
+        assert len(values)
+        return min(values)
+

 class RatioCounter(object):
    """ A counter to count ratio of something. """