Commit a5070b4e authored by Yuxin Wu

use NVML for GPUUtilizationTracker

parent 5667a220
......@@ -17,29 +17,29 @@ You can start using `tf.layers` today as long as it fits your need.
The following code:
```python
with argscope(Conv2D, out_channel=32, kernel_shape=3, nl=tf.nn.relu):
with argscope(Conv2D, filters=32, kernel_size=3, activation=tf.nn.relu):
l = (LinearWrap(image) # the starting brace is only for line-breaking
.Conv2D('conv0')
.MaxPooling('pool0', 2)
.Conv2D('conv1', padding='SAME')
.Conv2D('conv2', kernel_shape=5)
.FullyConnected('fc0', 512, nl=tf.nn.relu)
.Dropout('dropout', 0.5)
.Conv2D('conv2', kernel_size=5)
.FullyConnected('fc0', 512, activation=tf.nn.relu)
.Dropout('dropout', rate=0.5)
.tf.multiply(0.5)
.apply(func, *args, **kwargs)
.FullyConnected('fc1', out_dim=10, nl=tf.identity)())
.FullyConnected('fc1', units=10, activation=tf.identity)())
```
is equivalent to:
```
l = Conv2D('conv0', image, 32, 3, nl=tf.nn.relu)
l = Conv2D('conv0', image, 32, 3, activation=tf.nn.relu)
l = MaxPooling('pool0', l, 2)
l = Conv2D('conv1', l, 32, 3, padding='SAME', nl=tf.nn.relu)
l = Conv2D('conv2', l, 32, 5, nl=tf.nn.relu)
l = FullyConnected('fc0', l, 512, nl=tf.nn.relu)
l = Dropout('dropout', l, 0.5)
l = Conv2D('conv1', l, 32, 3, padding='SAME', activation=tf.nn.relu)
l = Conv2D('conv2', l, 32, 5, activation=tf.nn.relu)
l = FullyConnected('fc0', l, 512, activation=tf.nn.relu)
l = Dropout('dropout', l, rate=0.5)
l = tf.multiply(l, 0.5)
l = func(l, *args, **kwargs)
l = FullyConnected('fc1', l, 10, nl=tf.identity)
l = FullyConnected('fc1', l, 10, activation=tf.identity)
```
### Access Relevant Tensors
......
......@@ -13,8 +13,9 @@ from tensorflow.python.client import timeline
from .base import Callback
from ..utils import logger
from ..utils.concurrency import ensure_proc_terminate, subproc_call, start_proc_mask_signal
from ..utils.concurrency import ensure_proc_terminate, start_proc_mask_signal
from ..utils.gpu import get_nr_gpu
from ..utils.nvml import NVMLContext
__all__ = ['GPUUtilizationTracker', 'GraphProfiler', 'PeakMemoryTracker']
......@@ -37,23 +38,18 @@ class GPUUtilizationTracker(Callback):
if devices is None:
env = os.environ.get('CUDA_VISIBLE_DEVICES')
if env is None:
self._devices = list(map(str, range(get_nr_gpu())))
self._devices = list(range(get_nr_gpu()))
logger.warn("[GPUUtilizationTracker] Both devices and CUDA_VISIBLE_DEVICES are None! "
"Will monitor all {} visible GPUs!".format(len(self._devices)))
else:
if len(env):
self._devices = env.split(',')
self._devices = list(map(int, env.split(',')))
else:
self._devices = []
else:
self._devices = list(map(str, devices))
self._devices = devices
assert len(self._devices), "[GPUUtilizationTracker] No GPU device given!"
self._command = "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i " + \
','.join(self._devices)
_, ret = subproc_call(self._command)
assert ret == 0, "Cannot fetch GPU utilization!"
def _before_train(self):
self._evt = mp.Event()
self._stop_evt = mp.Event()
......@@ -72,7 +68,7 @@ class GPUUtilizationTracker(Callback):
self._evt.set()
stats = self._queue.get()
for idx, dev in enumerate(self._devices):
self.trainer.monitors.put_scalar('GPUUtil/{}'.format(dev), stats[idx])
self.trainer.monitors.put_scalar('GPUUtil/{:.2f}'.format(dev), stats[idx])
def _after_train(self):
self._stop_evt.set()
......@@ -88,23 +84,24 @@ class GPUUtilizationTracker(Callback):
stats = np.zeros((len(self._devices),), dtype='f4')
cnt = 0
while True:
time.sleep(1)
output, retv = subproc_call(self._command)
assert retv == 0, "Cannot fetch GPU Utilization!"
data = list(map(float, output.strip().split(b'\n')))
stats += data
cnt += 1
if evt.is_set(): # stop epoch
if stop_evt.is_set(): # or on exit
return
evt.clear()
# Ignore the last datapoint. Usually is zero, makes us underestimate the util.
stats -= data
cnt -= 1
rst_queue.put(stats / cnt)
break
with NVMLContext() as ctx:
while True:
time.sleep(1)
data = [ctx.device(i).utilization()['gpu'] for i in self._devices]
data = list(map(float, data))
stats += data
cnt += 1
if evt.is_set(): # stop epoch
if stop_evt.is_set(): # or on exit
return
evt.clear()
# Ignore the last datapoint. Usually is zero, makes us underestimate the util.
stats -= data
cnt -= 1
rst_queue.put(stats / cnt)
break
# Can add more features from tfprof
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment