Commit 99565998 authored by Yen-Chen Liu's avatar Yen-Chen Liu Committed by Yuxin Wu

Add the compression parameter to Horovod trainer (#1050)

* Add the compression parameter to Horovod trainer

* fixed typo

* Change compression default value of HorovodTrainer to None

* Check horovod version

* change import

* fix version check

* fix code for PEP8 requirements

* Update trainers.py
parent a9dce5b2
...@@ -370,24 +370,29 @@ class HorovodTrainer(SingleCostTrainer): ...@@ -370,24 +370,29 @@ class HorovodTrainer(SingleCostTrainer):
for a full example which has handled these common issues. for a full example which has handled these common issues.
This example can train ImageNet in roughly an hour following the paper's setup. This example can train ImageNet in roughly an hour following the paper's setup.
""" """
def __init__(self, average=True, compression=None):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
        compression: `hvd.Compression.fp16` or `hvd.Compression.none`.
            Only used when the installed horovod supports it (>= 0.15);
            otherwise silently ignored.
    """
    if 'pyarrow' in sys.modules:
        logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs. "
                    "Uninstall pyarrow and use msgpack instead.")
    # lazy import
    import horovod.tensorflow as _hvd
    import horovod
    global hvd
    hvd = _hvd
    # Parse only the (major, minor) prefix: components are always plain
    # integers there, whereas the patch component may carry a pre-release
    # suffix (e.g. "0.19.0rc1") that would make int() raise ValueError.
    hvd_version = tuple(map(int, horovod.__version__.split('.')[:2]))
    hvd.init()
    self.is_chief = hvd.rank() == 0
    self._local_rank = hvd.local_rank()
    self._rank = hvd.rank()
    self._average = average
    self._compression = compression
    # The `compression` argument to hvd.allreduce was added in horovod 0.15.
    self._has_compression = hvd_version >= (0, 15)
    logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
    super(HorovodTrainer, self).__init__()
...@@ -399,7 +404,10 @@ class HorovodTrainer(SingleCostTrainer): ...@@ -399,7 +404,10 @@ class HorovodTrainer(SingleCostTrainer):
with tf.name_scope("HVDAllReduce"): with tf.name_scope("HVDAllReduce"):
for grad, var in grads: for grad, var in grads:
if grad is not None: if grad is not None:
avg_grad = hvd.allreduce(grad, average=self._average) if self._compression is not None and self._has_compression:
avg_grad = hvd.allreduce(grad, average=self._average, compression=self._compression)
else:
avg_grad = hvd.allreduce(grad, average=self._average)
averaged_gradients.append((avg_grad, var)) averaged_gradients.append((avg_grad, var))
else: else:
averaged_gradients.append((None, var)) averaged_gradients.append((None, var))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment