fix #653, fix #655

68edaa0c · Yuxin Wu · 3398df09 · 68edaa0c · 68edaa0c · 68edaa0c
Commit 68edaa0c authored Feb 14, 2018 by Yuxin Wu
4 changed files
--- a/examples/ResNet/imagenet-resnet.py
+++ b/examples/ResNet/imagenet-resnet.py
@@ -61,6 +61,7 @@ def get_data(name, batch):
 def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
+    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower
    if fake:
@@ -73,14 +74,14 @@ def get_config(model, fake=False):
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
-        BASE_LR = 0.1 * (args.batch // 256)
+        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (85, BASE_LR * 1e-3), (95, BASE_LR * 1e-4), (105, BASE_LR * 1e-5)]),
        ]
-        if BASE_LR != 0.1:
+        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))
@@ -115,7 +116,7 @@ if __name__ == '__main__':
    parser.add_argument('-d', '--depth', help='resnet depth',
                        type=int, default=18, choices=[18, 34, 50, 101, 152])
    parser.add_argument('--eval', action='store_true')
-    parser.add_argument('--batch', help='total batch size. need to be multiple of 256 to get similar accuracy.',
+    parser.add_argument('--batch', help='total batch size. 256 gives best accuracy.',
                        default=256, type=int)
    parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                        help='variants of resnet to use', default='resnet')

--- a/tensorpack/input_source/input_source.py
+++ b/tensorpack/input_source/input_source.py
@@ -493,9 +493,10 @@ class StagingInput(FeedfreeInput):
                fetches=[self.stage_op, unstage_op])
        def _prefill(self):
-            logger.info("Pre-filling staging area ...")
+            logger.info("Pre-filling StagingArea ...")
            for k in range(self.nr_stage):
                self.stage_op.run()
+            logger.info("Put {} element(s) to StagingArea.")
        def _before_run(self, ctx):
            # This has to happen once, right before the first iteration.

--- a/tensorpack/train/trainers.py
+++ b/tensorpack/train/trainers.py
@@ -4,6 +4,7 @@
 import os
 import tensorflow as tf
+import multiprocessing as mp
 from ..callbacks import RunOp
 from ..tfutils.sesscreate import NewSessionCreator
@@ -339,8 +340,10 @@ class HorovodTrainer(SingleCostTrainer):
        # NOTE It will fail if GPU was already detected before initializing the session
        # https://github.com/tensorflow/tensorflow/issues/8136
        session_creator.config.gpu_options.visible_device_list = str(self._local_rank)
-        # TODO split #CPUs
+        try:
-        # session_creator.config.inter_op_parallelism_threads =
+            session_creator.config.inter_op_parallelism_threads = mp.cpu_count() // hvd.local_size()
+        except AttributeError:
+            pass
        super(HorovodTrainer, self).initialize(
            session_creator, session_init)

--- a/tensorpack/utils/serialize.py
+++ b/tensorpack/utils/serialize.py
@@ -8,11 +8,15 @@ import msgpack_numpy
 msgpack_numpy.patch()
 try:
+    # https://github.com/apache/arrow/pull/1223#issuecomment-359895666
    import sys
+    old_mod = sys.modules.get('torch', None)
    sys.modules['torch'] = None
-    # https://github.com/apache/arrow/pull/1223#issuecomment-359895666
    import pyarrow as pa
-    del sys.modules['torch']
+    if old_mod is not None:
+        sys.modules['torch'] = old_mod
+    else:
+        del sys.modules['torch']
 except ImportError:
    pa = None