Commit 83e4e187 authored by Yuxin Wu

replicated trainer aggregation mode heuristics

parent 0d7e71df
@@ -13,7 +13,7 @@ It's Yet Another TF wrapper, but different in:
 1. Focus on __training speed__.
 + Speed comes for free with tensorpack -- it uses TensorFlow in the __efficient way__ with no extra overhead.
-  On different CNNs, it runs [1.2~4x faster](https://github.com/tensorpack/benchmarks/tree/master/other-wrappers) than the equivalent Keras code.
+  On different CNNs, it runs [1.2~5x faster](https://github.com/tensorpack/benchmarks/tree/master/other-wrappers) than the equivalent Keras code.
 + Data-parallel multi-GPU training is off-the-shelf to use. It scales as well as Google's [official benchmark](https://www.tensorflow.org/performance/benchmarks).
......
@@ -223,13 +223,15 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
             if self._mode == 'nccl':
                 all_grads = allreduce_grads(all_grads, average=self._average)  # #gpu x #param x 2
             else:
-                # all_grads = allreduce_grads_hierarchical(all_grads, raw_devices, average=self._average)
                 packer = GradientPacker(len(raw_devices))
-                packer.compute_strategy(all_grads[0])
-                packed_grads = packer.pack_all(all_grads, raw_devices)
-                packed_grads_aggr = allreduce_grads_hierarchical(packed_grads, raw_devices, average=self._average)
-                all_grads = packer.unpack_all(packed_grads_aggr, raw_devices)
+                succ = packer.compute_strategy(all_grads[0])
+                if succ:
+                    packed_grads = packer.pack_all(all_grads, raw_devices)
+                    packed_grads_aggr = allreduce_grads_hierarchical(
+                        packed_grads, raw_devices, average=self._average)
+                    all_grads = packer.unpack_all(packed_grads_aggr, raw_devices)
+                else:
+                    all_grads = allreduce_grads_hierarchical(all_grads, raw_devices, average=self._average)
             self.grads = merge_grad_list(all_grads, all_vars)
         elif self._mode == 'cpu':
......
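The `else` branch above is the hierarchical path: many small per-GPU gradients are packed into a few large buffers, the buffers are all-reduced across devices, and the results are unpacked back into per-variable gradients; when `compute_strategy` reports that packing is not worthwhile, the hierarchical all-reduce runs on the raw gradients instead. Below is a conceptual, numpy-only sketch of the pack/aggregate/unpack round trip; the helper names and the `sum()` stand-in for the device-level all-reduce are hypothetical, not tensorpack's API.

```python
# Conceptual sketch (not tensorpack code): pack small gradient tensors into a
# few large chunks, aggregate the chunks, then unpack back to original shapes.
import numpy as np

def pack(grads, num_split):
    """Flatten and concatenate gradients, then cut them into num_split chunks."""
    flat = np.concatenate([g.ravel() for g in grads])
    split_size = flat.size // num_split
    # all chunks have split_size elements except the last, which takes the remainder
    cuts = [split_size * i for i in range(1, num_split)]
    return np.split(flat, cuts)

def unpack(chunks, shapes):
    """Rebuild the original tensors from the aggregated chunks."""
    flat = np.concatenate(chunks)
    out, offset = [], 0
    for s in shapes:
        n = int(np.prod(s))
        out.append(flat[offset:offset + n].reshape(s))
        offset += n
    return out

# toy "per-GPU" gradients: 2 GPUs, 3 parameter tensors each
per_gpu_grads = [[np.ones((4, 4)), np.ones(8), np.ones((2, 3))] for _ in range(2)]
shapes = [g.shape for g in per_gpu_grads[0]]

packed = [pack(g, num_split=2) for g in per_gpu_grads]   # #gpu x #split
aggregated = [sum(chunks) for chunks in zip(*packed)]    # stand-in for the all-reduce
summed = unpack(aggregated, shapes)                      # back to #param tensors
assert np.allclose(summed[0], 2 * per_gpu_grads[0][0])
```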
@@ -312,16 +312,24 @@ class GradientPacker(object):
     @call_only_once
     def compute_strategy(self, grads):
         """
+        Returns:
+            bool - False if grads cannot be packed due to various reasons.
         """
         for g in grads:
             assert g.shape.is_fully_defined(), "Shape of {} is {}!".format(g.name, g.shape)
         self._shapes = [g.shape for g in grads]
         self._sizes = [g.shape.num_elements() for g in grads]
         self._total_size = sum(self._sizes)
-        assert self._total_size > self._num_split
+        if self._total_size / self._num_split < 1024:
+            logger.info("Skip GradientPacker due to too few gradients.")
+            return False
         # should have the same dtype
         dtypes = set([g.dtype for g in grads])
-        assert len(dtypes) == 1, dtypes
+        if len(dtypes) != 1:
+            logger.info("Skip GradientPacker due to inconsistent gradient types.")
+            return False
         split_size = self._total_size // self._num_split
         split_size_last = self._total_size - split_size * (self._num_split - 1)
@@ -329,6 +337,7 @@ class GradientPacker(object):
         logger.info(
             "Will pack {} gradients of total number={} into {} splits.".format(
                 len(self._sizes), self._total_size, self._num_split))
+        return True

     def pack(self, grads):
         """
......
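The packing heuristic above comes down to simple integer arithmetic: skip packing when there are fewer than 1024 elements per split on average, otherwise cut the flattened gradients into `num_split` chunks whose last chunk absorbs the remainder. A minimal standalone sketch of that arithmetic, using made-up tensor sizes rather than any real model:

```python
# Sketch of the split arithmetic used by compute_strategy, with made-up sizes.
sizes = [64 * 3 * 7 * 7, 64, 512 * 1000, 1000, 10]   # elements per gradient tensor
num_split = 8

total_size = sum(sizes)
if total_size / num_split < 1024:
    print("Skip packing: too few gradient elements per split.")
else:
    split_size = total_size // num_split
    # the last split absorbs the remainder, so the chunks sum back to total_size
    split_size_last = total_size - split_size * (num_split - 1)
    print(total_size, split_size, split_size_last)   # 522482 65310 65312
```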
@@ -146,17 +146,19 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
             gpus (int or [int]): list of GPU ids.
             average (bool): whether to average or sum gradients.
             mode (str or None): Gradient aggregation mode.
-                These methods may have slight differences in speed.
-                Supported values: ['nccl', 'cpu']. Default to pick
-                automatically by heuristics.
+                Supported values: ['nccl', 'hierarchical', 'cpu'].
+                Default to pick automatically by heuristics.
+                These modes may have slight (within 5%) differences in speed.
         """
         self.devices = gpus
         if use_nccl is not None:
-            mode = 'nccl' if use_nccl else 'cpu'
+            mode = 'nccl' if use_nccl else None
             logger.warn("use_nccl option was deprecated! Use the `mode` option instead!")
         if mode is None:
-            mode = 'nccl'
+            mode = 'hierarchical' if len(gpus) >= 8 else 'nccl'
         mode = mode.lower()
         self._builder = SyncMultiGPUReplicatedBuilder(gpus, average, mode)
         super(SyncMultiGPUTrainerReplicated, self).__init__()
......
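With this change, users normally do not need to pick an aggregation mode at all. A usage sketch, assuming tensorpack's top-level trainer import; the GPU lists are made-up examples and the explicit `mode=` call just shows how to override the heuristic:

```python
# Sketch of how the new heuristic plays out at construction time.
# The class is tensorpack's; the GPU lists below are made-up examples.
from tensorpack import SyncMultiGPUTrainerReplicated

# mode=None (the default): with 8 GPUs the heuristic picks 'hierarchical'
trainer = SyncMultiGPUTrainerReplicated(list(range(8)))

# with fewer than 8 GPUs the heuristic picks 'nccl'
trainer = SyncMultiGPUTrainerReplicated(list(range(4)))

# the heuristic can always be overridden explicitly
trainer = SyncMultiGPUTrainerReplicated(list(range(4)), mode='cpu')
```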