Commit 07e28eea authored by Yuxin Wu

update about NVML and environment

parent 62ea40c8
...
@@ -160,7 +160,8 @@ or when you need to filter your data on the fly.
 but inefficient for generic data type or numpy arrays.
 Also, its implementation [does not always clean up the subprocesses correctly](https://github.com/pytorch/pytorch/issues/16608).
-PyTorch starts to improve on these bad assumptions (e.g., with [IterableDataset](https://github.com/pytorch/pytorch/pull/19228)).
+PyTorch starts to improve on bad assumptions 1-3 (e.g., with [IterableDataset](https://github.com/pytorch/pytorch/pull/19228)).
+But the interface still bears the history of these assumptions.
 On the other hand, DataFlow:
 1. Is an iterator, not necessarily has a length or can be indexed. This is more generic.
...
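To make point 1 concrete, here is a minimal sketch (the class name, shapes, and values are hypothetical) of a DataFlow that is a pure iterator: it can stream forever and never defines a length or supports indexing.

```python
import numpy as np
from tensorpack.dataflow import DataFlow

class RandomImageFlow(DataFlow):
    """Hypothetical DataFlow: an endless stream of random 'images'."""

    def __iter__(self):
        # No __len__ and no indexing: the stream is conceptually infinite,
        # which an index-based Dataset cannot express.
        while True:
            img = np.random.rand(28, 28).astype('float32')
            label = np.random.randint(10)
            yield [img, label]

df = RandomImageFlow()
df.reset_state()  # DataFlow convention: call once before iterating
for dp in df:
    break  # consume a single datapoint
```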
@@ -173,7 +173,7 @@ class MySimulatorMaster(SimulatorMaster, Callback):
         try:
             distrib, value = outputs.result()
         except CancelledError:
-            logger.info("Client {} cancelled.".format(client.ident))
+            logger.info("Client {} cancelled.".format(client.ident.decode('utf-8')))
             return
         assert np.all(np.isfinite(distrib)), distrib
         action = np.random.choice(len(distrib), p=distrib)
...
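The hunk above matters on Python 3, where a ZeroMQ client identity is a bytes object: formatting it directly puts a b'...' literal in the log. A tiny standalone illustration (the identity value is made up):

```python
# Hypothetical identity frame, as ZeroMQ would deliver it in Python 3.
ident = b'client-0'

print("Client {} cancelled.".format(ident))                  # Client b'client-0' cancelled.
print("Client {} cancelled.".format(ident.decode('utf-8')))  # Client client-0 cancelled.
```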
@@ -116,7 +116,7 @@ Performance in [Detectron](https://github.com/facebookresearch/Detectron/) can b
 We compare models that have identical training & inference cost between the two implementations.
 Their numbers can be different due to small implementation details.
 
-<a id="ft2">2</a>: Our mAP is __7 point__ better than the official model in
+<a id="ft2">2</a>: This model has __7 point__ better mAP than the official model in
 [matterport/Mask_RCNN](https://github.com/matterport/Mask_RCNN/releases/tag/v2.0) which has the same architecture.
 Our implementation is also [5x faster](https://github.com/tensorpack/benchmarks/tree/master/MaskRCNN).
...
@@ -124,10 +124,10 @@ class GPUUtilizationTracker(Callback):
         Args:
             devices (list[int])
         """
-        try:
-            with NVMLContext() as ctx:
-                devices = [ctx.device(i) for i in devices]
-                while True:
+        with NVMLContext() as ctx:
+            devices = [ctx.device(i) for i in devices]
+            while True:
+                try:
                     evt.wait()  # start epoch
                     evt.clear()
                     if stop_evt.is_set():  # or on exit
...
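The restructuring above keeps one NVMLContext open for the worker's whole lifetime and moves the try inside the loop, so a failure in one epoch no longer tears down the context. For reference, a minimal standalone query through the same wrapper (assumes NVML is available and at least one NVIDIA GPU exists):

```python
from tensorpack.utils.nvml import NVMLContext

# Open NVML once, query a device, and close it on exit, mirroring how the
# callback's worker holds the context across epochs.
with NVMLContext() as ctx:
    dev = ctx.device(0)       # first GPU; assumes one exists
    util = dev.utilization()  # dict of busy percentages per subsystem
    print("GPU busy: {}%".format(util['gpu']))
```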
@@ -10,6 +10,7 @@ import psutil
 import tensorflow as tf
 import numpy as np
+import tensorpack
 from ..compat import tfv1
 from ..utils.argtools import graph_memoized
 from ..utils.utils import find_library_full_path as find_library
...
@@ -172,10 +173,10 @@ def collect_env_info():
     data = []
     data.append(("sys.platform", sys.platform))
     data.append(("Python", sys.version.replace("\n", "")))
-    data.append(("Tensorpack", __git_version__))
+    data.append(("Tensorpack", __git_version__ + " @" + os.path.dirname(tensorpack.__file__)))
     data.append(("Numpy", np.__version__))
-    data.append(("TensorFlow", tfv1.VERSION + "/" + tfv1.GIT_VERSION))
+    data.append(("TensorFlow", tfv1.VERSION + "/" + tfv1.GIT_VERSION + " @" + os.path.dirname(tf.__file__)))
     data.append(("TF Compiler Version", tfv1.COMPILER_VERSION))
     has_cuda = tf.test.is_built_with_cuda()
     data.append(("TF CUDA support", has_cuda))
...
@@ -221,7 +222,7 @@ def collect_env_info():
     # Other important dependencies:
     try:
         import horovod
-        data.append(("Horovod", horovod.__version__))
+        data.append(("Horovod", horovod.__version__ + " @" + os.path.dirname(horovod.__file__)))
     except ImportError:
         pass
...
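Horovod is probed as an optional dependency: the import sits in a try block and an ImportError simply skips the entry. A generic sketch of that probe loop (the package list is illustrative, not tensorpack's actual list):

```python
import importlib
import os

data = []
for name in ["horovod", "cv2"]:  # illustrative package names
    try:
        mod = importlib.import_module(name)
        version = getattr(mod, "__version__", "unknown")
        data.append((name, version + " @" + os.path.dirname(mod.__file__)))
    except ImportError:
        pass  # optional dependency is absent; report nothing

print(data)
```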
@@ -43,19 +43,29 @@ def get_num_gpu():
             logger.warn(message + "But TensorFlow was not built with CUDA support and could not use GPUs!")
         return ret
 
+    try:
+        # Use NVML to query device properties
+        with NVMLContext() as ctx:
+            nvml_num_dev = ctx.num_devices()
+    except Exception:
+        nvml_num_dev = None
+
     env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
     if env:
-        return warn_return(len(env.split(',')), "Found non-empty CUDA_VISIBLE_DEVICES. ")
+        num_dev = len(env.split(','))
+        assert num_dev <= nvml_num_dev, \
+            "Only {} GPU(s) available, but CUDA_VISIBLE_DEVICES is set to {}".format(nvml_num_dev, env)
+        return warn_return(num_dev, "Found non-empty CUDA_VISIBLE_DEVICES. ")
 
     output, code = subproc_call("nvidia-smi -L", timeout=5)
     if code == 0:
         output = output.decode('utf-8')
         return warn_return(len(output.strip().split('\n')), "Found nvidia-smi. ")
-    try:
-        # Use NVML to query device properties
-        with NVMLContext() as ctx:
-            return warn_return(ctx.num_devices(), "NVML found nvidia devices. ")
-    except Exception:
-        # Fallback
+
+    if nvml_num_dev is not None:
+        return warn_return(nvml_num_dev, "NVML found nvidia devices. ")
+
+    # Fallback to TF
     logger.info("Loading local devices by TensorFlow ...")
     try:
...
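After this hunk, get_num_gpu() resolves the device count in a fixed priority order: a non-empty CUDA_VISIBLE_DEVICES (now validated against the NVML count, so asking for more devices than exist fails loudly), then nvidia-smi -L, then the NVML count itself, and finally TensorFlow's device listing. A usage sketch (the environment value is hypothetical):

```python
import os
from tensorpack.utils.gpu import get_num_gpu

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # hypothetical: expose two GPUs
# The variable is a comma-separated id list, so its length is the visible count.
print(get_num_gpu())  # -> 2 on a machine where NVML reports >= 2 devices
```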
@@ -191,6 +191,8 @@ class NVMLContext(object):
         Returns:
             NvidiaDevice: single GPU device
         """
+        num_dev = self.num_devices()
+        assert idx < num_dev, "Cannot obtain device {}: NVML only found {} devices.".format(idx, num_dev)
 
         class GpuDevice(Structure):
             pass
...
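The new assert in NVMLContext.device turns an out-of-range index into a readable error instead of an opaque NVML failure. From a caller's perspective (assumes NVML is available):

```python
from tensorpack.utils.nvml import NVMLContext

with NVMLContext() as ctx:
    n = ctx.num_devices()
    first = ctx.device(0)  # fine whenever n >= 1
    try:
        ctx.device(n)      # one past the end
    except AssertionError as e:
        print(e)           # Cannot obtain device n: NVML only found n devices.
```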