Commit 07e28eea authored by Yuxin Wu

update about NVML and environment

parent 62ea40c8
...
@@ -160,7 +160,8 @@ or when you need to filter your data on the fly.
 but inefficient for generic data type or numpy arrays.
 Also, its implementation [does not always clean up the subprocesses correctly](https://github.com/pytorch/pytorch/issues/16608).
-PyTorch starts to improve on these bad assumptions (e.g., with [IterableDataset](https://github.com/pytorch/pytorch/pull/19228)).
+PyTorch starts to improve on bad assumptions 1-3 (e.g., with [IterableDataset](https://github.com/pytorch/pytorch/pull/19228)).
+But the interface still bears the history of these assumptions.
 On the other hand, DataFlow:
 1. Is an iterator, not necessarily has a length or can be indexed. This is more generic.
...
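To make point 1 concrete, here is a minimal sketch (the class name, shapes, and values are hypothetical) of a DataFlow that is a pure iterator: it can stream forever and never defines a length or supports indexing.

```python
import numpy as np
from tensorpack.dataflow import DataFlow

class RandomImageFlow(DataFlow):
    """Hypothetical DataFlow: an endless stream of random 'images'."""

    def __iter__(self):
        # No __len__ and no indexing: the stream is conceptually infinite,
        # which an index-based Dataset cannot express.
        while True:
            img = np.random.rand(28, 28).astype('float32')
            label = np.random.randint(10)
            yield [img, label]

df = RandomImageFlow()
df.reset_state()  # DataFlow convention: call once before iterating
for dp in df:
    break  # consume a single datapoint
```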
@@ -173,7 +173,7 @@ class MySimulatorMaster(SimulatorMaster, Callback):
         try:
             distrib, value = outputs.result()
         except CancelledError:
-            logger.info("Client {} cancelled.".format(client.ident))
+            logger.info("Client {} cancelled.".format(client.ident.decode('utf-8')))
             return
         assert np.all(np.isfinite(distrib)), distrib
         action = np.random.choice(len(distrib), p=distrib)
...
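The hunk above matters on Python 3, where a ZeroMQ client identity is a bytes object: formatting it directly puts a b'...' literal in the log. A tiny standalone illustration (the identity value is made up):

```python
# Hypothetical identity frame, as ZeroMQ would deliver it in Python 3.
ident = b'client-0'

print("Client {} cancelled.".format(ident))                  # Client b'client-0' cancelled.
print("Client {} cancelled.".format(ident.decode('utf-8')))  # Client client-0 cancelled.
```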
@@ -116,7 +116,7 @@ Performance in [Detectron](https://github.com/facebookresearch/Detectron/) can b
 We compare models that have identical training & inference cost between the two implementations.
 Their numbers can be different due to small implementation details.
 
-<a id="ft2">2</a>: Our mAP is __7 point__ better than the official model in
+<a id="ft2">2</a>: This model has __7 point__ better mAP than the official model in
 [matterport/Mask_RCNN](https://github.com/matterport/Mask_RCNN/releases/tag/v2.0) which has the same architecture.
 Our implementation is also [5x faster](https://github.com/tensorpack/benchmarks/tree/master/MaskRCNN).
...
@@ -124,10 +124,10 @@ class GPUUtilizationTracker(Callback):
         Args:
             devices (list[int])
         """
-        try:
-            with NVMLContext() as ctx:
-                devices = [ctx.device(i) for i in devices]
-                while True:
+        with NVMLContext() as ctx:
+            devices = [ctx.device(i) for i in devices]
+            while True:
+                try:
                     evt.wait()  # start epoch
                     evt.clear()
                     if stop_evt.is_set():  # or on exit
...
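The restructuring above keeps one NVMLContext open for the worker's whole lifetime and moves the try inside the loop, so a failure in one epoch no longer tears down the context. For reference, a minimal standalone query through the same wrapper (assumes NVML is available and at least one NVIDIA GPU exists):

```python
from tensorpack.utils.nvml import NVMLContext

# Open NVML once, query a device, and close it on exit, mirroring how the
# callback's worker holds the context across epochs.
with NVMLContext() as ctx:
    dev = ctx.device(0)       # first GPU; assumes one exists
    util = dev.utilization()  # dict of busy percentages per subsystem
    print("GPU busy: {}%".format(util['gpu']))
```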
@@ -10,6 +10,7 @@ import psutil
 import tensorflow as tf
 import numpy as np
+import tensorpack
 from ..compat import tfv1
 from ..utils.argtools import graph_memoized
 from ..utils.utils import find_library_full_path as find_library
...
@@ -172,10 +173,10 @@ def collect_env_info():
     data = []
     data.append(("sys.platform", sys.platform))
     data.append(("Python", sys.version.replace("\n", "")))
-    data.append(("Tensorpack", __git_version__))
+    data.append(("Tensorpack", __git_version__ + " @" + os.path.dirname(tensorpack.__file__)))
     data.append(("Numpy", np.__version__))
-    data.append(("TensorFlow", tfv1.VERSION + "/" + tfv1.GIT_VERSION))
+    data.append(("TensorFlow", tfv1.VERSION + "/" + tfv1.GIT_VERSION + " @" + os.path.dirname(tf.__file__)))
     data.append(("TF Compiler Version", tfv1.COMPILER_VERSION))
     has_cuda = tf.test.is_built_with_cuda()
     data.append(("TF CUDA support", has_cuda))
...
@@ -221,7 +222,7 @@ def collect_env_info():
     # Other important dependencies:
     try:
         import horovod
-        data.append(("Horovod", horovod.__version__))
+        data.append(("Horovod", horovod.__version__ + " @" + os.path.dirname(horovod.__file__)))
     except ImportError:
         pass
...
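Horovod is probed as an optional dependency: the import sits in a try block and an ImportError simply skips the entry. A generic sketch of that probe loop (the package list is illustrative, not tensorpack's actual list):

```python
import importlib
import os

data = []
for name in ["horovod", "cv2"]:  # illustrative package names
    try:
        mod = importlib.import_module(name)
        version = getattr(mod, "__version__", "unknown")
        data.append((name, version + " @" + os.path.dirname(mod.__file__)))
    except ImportError:
        pass  # optional dependency is absent; report nothing

print(data)
```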
@@ -43,19 +43,29 @@ def get_num_gpu():
             logger.warn(message + "But TensorFlow was not built with CUDA support and could not use GPUs!")
         return ret
 
+    try:
+        # Use NVML to query device properties
+        with NVMLContext() as ctx:
+            nvml_num_dev = ctx.num_devices()
+    except Exception:
+        nvml_num_dev = None
+
     env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
     if env:
-        return warn_return(len(env.split(',')), "Found non-empty CUDA_VISIBLE_DEVICES. ")
+        num_dev = len(env.split(','))
+        assert num_dev <= nvml_num_dev, \
+            "Only {} GPU(s) available, but CUDA_VISIBLE_DEVICES is set to {}".format(nvml_num_dev, env)
+        return warn_return(num_dev, "Found non-empty CUDA_VISIBLE_DEVICES. ")
 
     output, code = subproc_call("nvidia-smi -L", timeout=5)
     if code == 0:
         output = output.decode('utf-8')
         return warn_return(len(output.strip().split('\n')), "Found nvidia-smi. ")
-    try:
-        # Use NVML to query device properties
-        with NVMLContext() as ctx:
-            return warn_return(ctx.num_devices(), "NVML found nvidia devices. ")
-    except Exception:
-        # Fallback
+
+    if nvml_num_dev is not None:
+        return warn_return(nvml_num_dev, "NVML found nvidia devices. ")
+
+    # Fallback to TF
     logger.info("Loading local devices by TensorFlow ...")
     try:
...
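After this hunk, get_num_gpu() resolves the device count in a fixed priority order: a non-empty CUDA_VISIBLE_DEVICES (now validated against the NVML count, so asking for more devices than exist fails loudly), then nvidia-smi -L, then the NVML count itself, and finally TensorFlow's device listing. A usage sketch (the environment value is hypothetical):

```python
import os
from tensorpack.utils.gpu import get_num_gpu

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # hypothetical: expose two GPUs
# The variable is a comma-separated id list, so its length is the visible count.
print(get_num_gpu())  # -> 2 on a machine where NVML reports >= 2 devices
```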
@@ -191,6 +191,8 @@ class NVMLContext(object):
         Returns:
             NvidiaDevice: single GPU device
         """
+        num_dev = self.num_devices()
+        assert idx < num_dev, "Cannot obtain device {}: NVML only found {} devices.".format(idx, num_dev)
 
         class GpuDevice(Structure):
             pass
...
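The new assert in NVMLContext.device turns an out-of-range index into a readable error instead of an opaque NVML failure. From a caller's perspective (assumes NVML is available):

```python
from tensorpack.utils.nvml import NVMLContext

with NVMLContext() as ctx:
    n = ctx.num_devices()
    first = ctx.device(0)  # fine whenever n >= 1
    try:
        ctx.device(n)      # one past the end
    except AssertionError as e:
        print(e)           # Cannot obtain device n: NVML only found n devices.
```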