use NVML for GPUUtilizationTracker

a5070b4e · Yuxin Wu · 5667a220 · a5070b4e · a5070b4e
Commit a5070b4e authored Feb 11, 2018 by Yuxin Wu
Hide whitespace changes
Inline Side-by-side

Showing with 35 additions and 38 deletions

docs/tutorial/symbolic.md +11 -11

tensorpack/callbacks/prof.py +24 -27

No files found.
--- a/docs/tutorial/symbolic.md
+++ b/docs/tutorial/symbolic.md
@@ -17,29 +17,29 @@ You can start using `tf.layers` today as long as it fits your need.

 The following code:
 ```python
-with argscope(Conv2D, out_channel=32, kernel_shape=3, nl=tf.nn.relu):
+with argscope(Conv2D, filters=32, kernel_size=3, activation=tf.nn.relu):
  l = (LinearWrap(image)  # the starting brace is only for line-breaking
       .Conv2D('conv0')
       .MaxPooling('pool0', 2)
       .Conv2D('conv1', padding='SAME')
-       .Conv2D('conv2', kernel_shape=5)
-       .FullyConnected('fc0', 512, nl=tf.nn.relu)
-       .Dropout('dropout', 0.5)
+       .Conv2D('conv2', kernel_size=5)
+       .FullyConnected('fc0', 512, activation=tf.nn.relu)
+       .Dropout('dropout', rate=0.5)
       .tf.multiply(0.5)
       .apply(func, *args, **kwargs)
-       .FullyConnected('fc1', out_dim=10, nl=tf.identity)())
+       .FullyConnected('fc1', units=10, activation=tf.identity)())
 ```
 is equivalent to:
 ```
-l = Conv2D('conv0', image, 32, 3, nl=tf.nn.relu)
+l = Conv2D('conv0', image, 32, 3, activation=tf.nn.relu)
 l = MaxPooling('pool0', l, 2)
-l = Conv2D('conv1', l, 32, 3, padding='SAME', nl=tf.nn.relu)
-l = Conv2D('conv2', l, 32, 5, nl=tf.nn.relu)
-l = FullyConnected('fc0', l, 512, nl=tf.nn.relu)
-l = Dropout('dropout', l, 0.5)
+l = Conv2D('conv1', l, 32, 3, padding='SAME', activation=tf.nn.relu)
+l = Conv2D('conv2', l, 32, 5, activation=tf.nn.relu)
+l = FullyConnected('fc0', l, 512, activation=tf.nn.relu)
+l = Dropout('dropout', l, rate=0.5)
 l = tf.multiply(l, 0.5)
 l = func(l, *args, **kwargs)
-l = FullyConnected('fc1', l, 10, nl=tf.identity)
+l = FullyConnected('fc1', l, 10, activation=tf.identity)
 ```

 ### Access Relevant Tensors

--- a/tensorpack/callbacks/prof.py
+++ b/tensorpack/callbacks/prof.py
@@ -13,8 +13,9 @@ from tensorflow.python.client import timeline

 from .base import Callback
 from ..utils import logger
-from ..utils.concurrency import ensure_proc_terminate, subproc_call, start_proc_mask_signal
+from ..utils.concurrency import ensure_proc_terminate, start_proc_mask_signal
 from ..utils.gpu import get_nr_gpu
+from ..utils.nvml import NVMLContext

 __all__ = ['GPUUtilizationTracker', 'GraphProfiler', 'PeakMemoryTracker']

@@ -37,23 +38,18 @@ class GPUUtilizationTracker(Callback):
        if devices is None:
            env = os.environ.get('CUDA_VISIBLE_DEVICES')
            if env is None:
-                self._devices = list(map(str, range(get_nr_gpu())))
+                self._devices = list(range(get_nr_gpu()))
                logger.warn("[GPUUtilizationTracker] Both devices and CUDA_VISIBLE_DEVICES are None! "
                            "Will monitor all {} visible GPUs!".format(len(self._devices)))
            else:
                if len(env):
-                    self._devices = env.split(',')
+                    self._devices = list(map(int, env.split(',')))
                else:
                    self._devices = []
        else:
-            self._devices = list(map(str, devices))
+            self._devices = devices
        assert len(self._devices), "[GPUUtilizationTracker] No GPU device given!"

-        self._command = "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i " + \
-            ','.join(self._devices)
-        _, ret = subproc_call(self._command)
-        assert ret == 0, "Cannot fetch GPU utilization!"
-
    def _before_train(self):
        self._evt = mp.Event()
        self._stop_evt = mp.Event()
@@ -72,7 +68,7 @@ class GPUUtilizationTracker(Callback):
        self._evt.set()
        stats = self._queue.get()
        for idx, dev in enumerate(self._devices):
-            self.trainer.monitors.put_scalar('GPUUtil/{}'.format(dev), stats[idx])
+            self.trainer.monitors.put_scalar('GPUUtil/{:.2f}'.format(dev), stats[idx])

    def _after_train(self):
        self._stop_evt.set()
@@ -88,23 +84,24 @@ class GPUUtilizationTracker(Callback):

            stats = np.zeros((len(self._devices),), dtype='f4')
            cnt = 0
-            while True:
-                time.sleep(1)
-                output, retv = subproc_call(self._command)
-                assert retv == 0, "Cannot fetch GPU Utilization!"
-                data = list(map(float, output.strip().split(b'\n')))
-                stats += data
-                cnt += 1
-
-                if evt.is_set():    # stop epoch
-                    if stop_evt.is_set():   # or on exit
-                        return
-                    evt.clear()
-                    # Ignore the last datapoint. Usually is zero, makes us underestimate the util.
-                    stats -= data
-                    cnt -= 1
-                    rst_queue.put(stats / cnt)
-                    break
+            with NVMLContext() as ctx:
+                while True:
+                    time.sleep(1)
+
+                    data = [ctx.device(i).utilization()['gpu'] for i in self._devices]
+                    data = list(map(float, data))
+                    stats += data
+                    cnt += 1
+
+                    if evt.is_set():    # stop epoch
+                        if stop_evt.is_set():   # or on exit
+                            return
+                        evt.clear()
+                        # Ignore the last datapoint. Usually is zero, makes us underestimate the util.
+                        stats -= data
+                        cnt -= 1
+                        rst_queue.put(stats / cnt)
+                        break


 # Can add more features from tfprof