Commit f34f454e authored by Yuxin Wu

update docs; workaround TF bug

parent c4a68c6c
@@ -8,7 +8,7 @@ This example provides a minimal (2k lines) and faithful implementation of the fo
with the support of:
+ Multi-GPU / distributed training
+ [Cross-GPU BatchNorm](https://arxiv.org/abs/1711.07240)
+ Cross-GPU BatchNorm (from [MegDet: A Large Mini-Batch Object Detector](https://arxiv.org/abs/1711.07240))
+ [Group Normalization](https://arxiv.org/abs/1803.08494)
## Dependencies
@@ -56,7 +56,6 @@ Options can be changed by either the command line or the `config.py` file.
Recommended configurations are listed in the table below.
The code is only valid for training with 1, 2, 4 or >=8 GPUs.
Training with a number of GPUs other than 8 may result in performance different from the table below.
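As a hedged illustration of the `config.py` route mentioned above (not taken from this diff): the sketch assumes the example exposes an attribute-style `config` object in `config.py` and that keys such as `MODE_MASK` or `PREPROC.SHORT_EDGE_SIZE` map directly to attributes.

```python
# Hypothetical sketch: override a few options programmatically instead of
# passing KEY=VALUE pairs on the command line. The import path and attribute
# names are assumptions based on the option strings shown in this README.
from config import config as cfg

cfg.MODE_MASK = False
cfg.PREPROC.SHORT_EDGE_SIZE = 600
cfg.PREPROC.MAX_SIZE = 1024
```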
### Inference:
@@ -77,11 +76,13 @@ prediction will need to be run with the corresponding training configs.
## Results
These models are trained on trainval35k and evaluated on minival2014 using mAP@IoU=0.50:0.95.
All models are fine-tuned from ImageNet pre-trained R50/R101 models in the [model zoo](http://models.tensorpack.com/FasterRCNN/).
All models are fine-tuned from ImageNet pre-trained R50/R101 models in the
[model zoo](http://models.tensorpack.com/FasterRCNN/), unless otherwise noted.
All models are trained with 8 NVIDIA V100s, unless otherwise noted.
Performance in [Detectron](https://github.com/facebookresearch/Detectron/) can be roughly reproduced.
Mask R-CNN results contain both box and mask mAP.
| Backbone | mAP<br/>(box;mask) | Detectron mAP <sup>[1](#ft1)</sup><br/> (box;mask) | Time on 8 V100s | Configurations <br/> (click to expand) |
| Backbone | mAP<br/>(box;mask) | Detectron mAP <sup>[1](#ft1)</sup><br/> (box;mask) | Time (on 8 V100s) | Configurations <br/> (click to expand) |
| - | - | - | - | - |
| R50-C4 | 33.1 | | 18h | <details><summary>super quick</summary>`MODE_MASK=False FRCNN.BATCH_PER_IM=64`<br/>`PREPROC.SHORT_EDGE_SIZE=600 PREPROC.MAX_SIZE=1024`<br/>`TRAIN.LR_SCHEDULE=[150000,230000,280000]` </details> |
| R50-C4 | 36.6 | 36.5 | 44h | <details><summary>standard</summary>`MODE_MASK=False` </details> |
......
@@ -29,7 +29,7 @@ setup(
"tabulate>=0.7.7",
"tqdm>4.11.1",
"msgpack>=0.5.2",
"msgpack-numpy>=0.4.0",
"msgpack-numpy>=0.4.4.2",
"pyzmq>=16",
"subprocess32; python_version < '3.0'",
"functools32; python_version < '3.0'",
......
@@ -53,7 +53,7 @@ class GPUUtilizationTracker(Callback):
assert len(self._devices), "[GPUUtilizationTracker] No GPU device given!"
def _before_train(self):
assert tf.test.is_gpu_available()
# assert tf.test.is_gpu_available()
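# Disabled as a workaround for a TF bug (see the commit message); likely because
# tf.test.is_gpu_available() itself creates a session and can occupy GPU memory.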
self._evt = mp.Event()
self._stop_evt = mp.Event()
self._queue = mp.Queue()
@@ -213,7 +213,7 @@ class PeakMemoryTracker(Callback):
self._fetches = tf.train.SessionRunArgs(fetches=ops)
def _before_run(self, _):
assert tf.test.is_gpu_available(), "PeakMemoryTracker only supports GPU!"
# assert tf.test.is_gpu_available(), "PeakMemoryTracker only supports GPU!"
if self.local_step == self.trainer.steps_per_epoch - 1:
return self._fetches
return None
......
@@ -310,7 +310,7 @@ class HorovodTrainer(SingleCostTrainer):
.. code-block:: bash
# First, change trainer to HorovodTrainer(), then
CUDA_VISIBLE_DEVICES=0,1,2,3 mpirun -np 4 --output-filename mylog python train.py
CUDA_VISIBLE_DEVICES=0,1,2,3 NCCL_DEBUG=INFO mpirun -np 4 --output-filename mylog python train.py
To use for distributed training:
@@ -319,7 +319,7 @@ class HorovodTrainer(SingleCostTrainer):
# First, change trainer to HorovodTrainer(), then
mpirun -np 8 -H server1:4,server2:4 \\
-bind-to none -map-by slot \\
--output-filename mylog -x LD_LIBRARY_PATH \\
--output-filename mylog -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\
python train.py
# Add other environment variables you need by -x, e.g. PYTHONPATH, PATH.
# If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
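For completeness, a minimal sketch of the script-side change the comments above refer to ("change trainer to HorovodTrainer()"); it assumes a tensorpack `TrainConfig` named `config` already exists, and everything else (model, dataflow) is omitted.

```python
# Minimal sketch, assuming `config` is an existing tensorpack TrainConfig.
from tensorpack import launch_train_with_config
from tensorpack.train import HorovodTrainer

trainer = HorovodTrainer()
launch_train_with_config(config, trainer)
# Then launch the script with mpirun as shown above.
```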
@@ -430,8 +430,8 @@ class HorovodTrainer(SingleCostTrainer):
except AttributeError: # old horovod does not have local_size
pass
super(HorovodTrainer, self).initialize(session_creator, session_init)
if not tf.test.is_gpu_available():
logger.error("tf.test.is_gpu_available() == False")
# if not tf.test.is_gpu_available():
# logger.error("tf.test.is_gpu_available() == False")
# This broadcast belongs to the "initialize" stage
# It should not be delayed to the "before_train" stage.
......
@@ -185,7 +185,8 @@ def enable_death_signal(_warn=True):
import prctl # pip install python-prctl
except ImportError:
if _warn:
log_once('Install python-prctl so that processes can be cleaned with guarantee.', 'warn')
log_once('"import prctl" failed! Install python-prctl so that processes can be cleaned with guarantee.',
'warn')
return
else:
assert hasattr(prctl, 'set_pdeathsig'), \
......
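As a rough sketch of what `enable_death_signal` does once `prctl` imports successfully (the specific signal, `SIGHUP`, is an assumption for illustration):

```python
import signal

def enable_death_signal_sketch(_warn=True):
    """Ask the kernel to signal this process when its parent dies."""
    try:
        import prctl  # pip install python-prctl
    except ImportError:
        return
    # set_pdeathsig() is provided by python-prctl; SIGHUP is an assumed choice.
    prctl.set_pdeathsig(signal.SIGHUP)
```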
@@ -65,6 +65,7 @@ try:
import msgpack
import msgpack_numpy
msgpack_numpy.patch()
assert msgpack.version >= (0, 5, 2)
except ImportError:
loads_msgpack = create_dummy_func( # noqa
'loads_msgpack', ['msgpack', 'msgpack_numpy'])
......
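For context on the hunk above, a minimal sketch of how `msgpack` plus `msgpack-numpy` are typically used for (de)serialization; the helper names mirror the snippet, while the keyword arguments are assumptions about a reasonable configuration.

```python
import msgpack
import msgpack_numpy

msgpack_numpy.patch()                 # make msgpack aware of numpy arrays
assert msgpack.version >= (0, 5, 2)   # raw=False below needs a recent msgpack

def dumps_msgpack(obj):
    # use_bin_type=True keeps bytes and unicode strings distinct on the wire
    return msgpack.dumps(obj, use_bin_type=True)

def loads_msgpack(buf):
    # raw=False decodes msgpack strings back to unicode on load
    return msgpack.loads(buf, raw=False)
```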