Commit a12872dc authored by Yuxin Wu

remove extra kwargs in TrainConfig

parent 229e991a
@@ -33,7 +33,7 @@ class CallbackTimeLogger(object):
     def log(self):
         """ log the time of some heavy callbacks """
-        if self.tot < 3:
+        if self.tot < 2:
             return
         msgs = []
         for name, t in self.times:
...
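In effect, this hunk lowers the reporting threshold: callback times are now logged once they total at least 2 seconds (previously 3). A minimal sketch of the surrounding class, assuming an `add()` accumulator and a plain-print message format, neither of which is shown in this hunk:

```python
class CallbackTimeLogger(object):
    def __init__(self):
        self.times = []   # (callback name, seconds) pairs
        self.tot = 0.0    # total time spent in callbacks

    def add(self, name, time):
        # assumed accumulator; not part of this hunk
        self.times.append((name, time))
        self.tot += time

    def log(self):
        """ log the time of some heavy callbacks """
        if self.tot < 2:  # threshold lowered from 3 in this commit
            return
        msgs = []
        for name, t in self.times:
            msgs.append("{}: {:.3f}sec".format(name, t))
        print("Callbacks took {:.3f}sec in total. {}".format(
            self.tot, "; ".join(msgs)))

# Usage sketch:
tm = CallbackTimeLogger()
tm.add("ModelSaver", 1.2)
tm.add("InferenceRunner", 1.5)
tm.log()   # 2.7sec total >= 2, so a summary line is printed
```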
@@ -61,8 +61,7 @@ class TrainConfig(object):
                  model=None,
                  callbacks=None, extra_callbacks=None, monitors=None,
                  session_creator=None, session_config=None, session_init=None,
-                 starting_epoch=1, steps_per_epoch=None, max_epoch=99999,
-                 **kwargs):
+                 starting_epoch=1, steps_per_epoch=None, max_epoch=99999):
         """
         Args:
             dataflow (DataFlow):
...
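With `**kwargs` removed from the signature, unrecognized keyword arguments now raise a `TypeError` at construction time instead of being silently swallowed. A minimal usage sketch against the remaining signature (`MyModel` and `my_dataflow` are hypothetical placeholders, not part of this commit):

```python
from tensorpack import TrainConfig
from tensorpack.callbacks import ModelSaver

config = TrainConfig(
    model=MyModel(),        # hypothetical ModelDesc subclass
    dataflow=my_dataflow,   # hypothetical DataFlow instance
    callbacks=[ModelSaver()],
    steps_per_epoch=100,
    max_epoch=50,
)

# A misspelled or obsolete keyword now fails loudly, e.g.:
# TrainConfig(model=MyModel(), max_epochs=50)
#   -> TypeError: __init__() got an unexpected keyword argument 'max_epochs'
```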
@@ -344,28 +344,19 @@ class HorovodTrainer(SingleCostTrainer):
     .. code-block:: bash

         # First, change trainer to HorovodTrainer(), then
-        CUDA_VISIBLE_DEVICES=0,1,2,3 NCCL_DEBUG=INFO mpirun -np 4 --output-filename mylog python train.py
+        CUDA_VISIBLE_DEVICES=0,1,2,3 NCCL_DEBUG=INFO horovodrun -np 4 --output-filename mylog python train.py

     To use for distributed training:

     .. code-block:: bash

         # First, change trainer to HorovodTrainer(), then
-        mpirun -np 8 -H server1:4,server2:4 \\
-            -bind-to none -map-by slot \\
-            --output-filename mylog -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\
+        horovodrun -np 8 -H server1:4,server2:4 --output-filename mylog \\
             python train.py
-        # Add other environment variables you need by -x, e.g. PYTHONPATH, PATH.
-        # If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
-        # There are other MPI options that can potentially improve performance especially on special hardwares.
-
-    Horovod can also be launched without MPI. See
-    `its documentation <https://github.com/horovod/horovod#running-horovod>`_
-    for more details.

     Note:
         1. To reach the maximum speed in your system, there are many options to tune
-           for Horovod installation and in the MPI command line.
+           in Horovod installation, horovodrun arguments, and in the MPI command line.
            See Horovod docs for details.

         2. Due to a TF bug (#8136), you must not initialize CUDA context before the trainer starts training.
@@ -378,6 +369,10 @@ class HorovodTrainer(SingleCostTrainer):
            + MPI does not like `fork()`. If your code (e.g. dataflow) contains multiprocessing, it may cause problems.
            + MPI sometimes fails to kill all processes in the end. Be sure to check it afterwards.

+           The gloo backend is recommended, though it may come with a very minor slowdown.
+           To use the gloo backend, see the
+           `horovod documentation <https://github.com/horovod/horovod#running-horovod>`_ for more details.
+
         4. Keep in mind that there is one process running the script per GPU, therefore:

            + Make sure your InputSource has reasonable randomness.
...
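On the Python side, the `horovodrun` commands in the updated docstring assume the training script constructs the trainer itself. A minimal sketch of that pattern (the config contents are hypothetical placeholders; only the trainer choice matters here):

```python
from tensorpack import TrainConfig, launch_train_with_config
from tensorpack.train import HorovodTrainer

config = TrainConfig(
    model=MyModel(),        # hypothetical ModelDesc subclass
    dataflow=my_dataflow,   # hypothetical DataFlow instance
    steps_per_epoch=100,
    max_epoch=50,
)
# horovodrun starts one copy of this script per GPU; each process
# trains on its own GPU and Horovod averages the gradients.
launch_train_with_config(config, HorovodTrainer())
```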