update docs

2d661d6d · Yuxin Wu · 379e9a07 · 2d661d6d · 2d661d6d · 2d661d6d
Commit 2d661d6d authored Aug 18, 2020 by Yuxin Wu
6 changed files
--- a/.github/ISSUE_TEMPLATE/unexpected-problems---bugs.md
+++ b/.github/ISSUE_TEMPLATE/unexpected-problems---bugs.md
@@ -50,10 +50,10 @@ If you expect higher speed, please read
 http://tensorpack.readthedocs.io/tutorial/performance-tuning.html
 before posting.
-If you expect the model to converge / work better, note that we do not help you on how to train a new model.
+If you expect the model to converge / work better, note that we do not offer help on how to improve a model.
 Only in one of the two conditions can we help with it:
 (1) You're unable to reproduce the results documented in tensorpack examples.
-(2) It appears to be a tensorpack bug.
+(2) It indicates a tensorpack bug.
 ### 4. Your environment:

--- a/examples/FasterRCNN/NOTES.md
+++ b/examples/FasterRCNN/NOTES.md
@@ -48,7 +48,7 @@ This is a minimal implementation that simply contains these files:
 3. We currently only support single image per GPU in this example.
-4. Because of (3), BatchNorm statistics are supposed to be freezed during fine-tuning.
+4. Because of (3), BatchNorm statistics are supposed to be frozen during fine-tuning.
 5. An alternative to freezing BatchNorm is to sync BatchNorm statistics across
   GPUs (the `BACKBONE.NORM=SyncBN` option).

--- a/examples/FasterRCNN/train.py
+++ b/examples/FasterRCNN/train.py
@@ -115,6 +115,5 @@ if __name__ == '__main__':
    if is_horovod:
        trainer = HorovodTrainer(average=False)
    else:
-        # nccl mode appears faster than cpu mode
        trainer = SyncMultiGPUTrainerReplicated(cfg.TRAIN.NUM_GPUS, average=False)
    launch_train_with_config(traincfg, trainer)
--- a/tensorpack/graph_builder/training.py
+++ b/tensorpack/graph_builder/training.py
@@ -211,8 +211,7 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
        self._mode = mode
        if self._mode == 'hierarchical' and len(towers) != 8:
-            logger.warn("mode='hierarchical' require 8 GPUs. Fallback to mode='nccl'.")
+            raise ValueError("mode='hierarchical' require 8 GPUs.")
-            self._mode = 'nccl'
    def call_for_each_tower(self, tower_fn):
        """

--- a/tensorpack/models/batch_norm.py
+++ b/tensorpack/models/batch_norm.py
@@ -75,6 +75,9 @@ def get_sync_bn_mean_var(inputs, red_axis, sync_statistics):
            assert TF_version >= (1, 10), \
                "Cross-GPU BatchNorm is only supported in TF>=1.10 ." \
                "Upgrade TF or apply this patch manually: https://github.com/tensorflow/tensorflow/pull/20360"
+            if TF_version >= (1, 15):
+                logger.warn("BatchNorm(sync_statistics='nccl') may produce incorrect results due "
+                            "to bug in TF>=1.15: https://github.com/tensorflow/tensorflow/issues/41539")
            if TF_version <= (1, 12):
                try:

--- a/tensorpack/train/trainers.py
+++ b/tensorpack/train/trainers.py
@@ -168,10 +168,10 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
            gpus (int or [int]): list of GPU ids.
            average (bool): whether to average or sum gradients.
            mode (str or None): Gradient aggregation mode.
-                Supported values: ['nccl', 'hierarchical', 'cpu'].
+                Supported values: ['nccl', 'hierarchical', 'cpu', 'gpu'].
+                These modes may differ in speed.
                Default to pick automatically by heuristics.
-                These modes may have slight (within 5%) differences in speed.
+                "hierarchical" mode was designed for DGX-like 8-GPU machines.
-                "hierarchical" mode was designed for DGX-like 8GPU machines.
        """
        self.devices = gpus
        if mode is not None: