Heuristics for progress bar refresh interval

56a77747 · Yuxin Wu · 3a0c5e9a
Commit 56a77747 authored Jun 07, 2018 by Yuxin Wu
4 changed files
--- a/examples/ImageNetModels/README.md
+++ b/examples/ImageNetModels/README.md
@@ -33,6 +33,7 @@ paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convo
 Trained with 64x2 batch size, the script reaches 58% single-crop validation
 accuracy after 100 epochs (21 hours on 2 V100s).
 It also puts in tensorboard the first-layer filter visualizations similar to the paper.
+See `./alexnet.py --help` for usage.
 ### Inception-BN, VGG16
@@ -43,6 +44,7 @@ is a bit vague on these details.
 This VGG16 script, when trained with 32x8 batch size, reaches the following
 validation error after 100 epochs (30h with 8 P100s). This is the code for the VGG
 experiments in the paper [Group Normalization](https://arxiv.org/abs/1803.08494).
+See `./vgg16.py --help` for usage.
 | No Normalization                          | Batch Normalization | Group Normalization |
 |:------------------------------------------|---------------------|--------------------:|

--- a/examples/ImageNetModels/alexnet.py
+++ b/examples/ImageNetModels/alexnet.py
@@ -126,7 +126,6 @@ if __name__ == '__main__':
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--batch', type=int, default=32, help='batch per GPU')
-    parser.add_argument('--load', help='load model')
    args = parser.parse_args()
    if args.gpu:

--- a/examples/ImageNetModels/vgg16.py
+++ b/examples/ImageNetModels/vgg16.py
@@ -8,7 +8,7 @@ import os
 import tensorflow as tf
 from tensorpack import *
-from tensorpack.tfutils import argscope, get_model_loader
+from tensorpack.tfutils import argscope
 from tensorpack.tfutils.summary import *
 from tensorpack.utils.gpu import get_nr_gpu
@@ -151,7 +151,6 @@ if __name__ == '__main__':
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--batch', type=int, default=32, help='batch per GPU')
    parser.add_argument('--norm', choices=['none', 'bn', 'gn'], default='none')
-    parser.add_argument('--load', help='load model')
    args = parser.parse_args()
    if args.gpu:
@@ -160,8 +159,6 @@ if __name__ == '__main__':
    logger.set_logger_dir(os.path.join('train_log', 'vgg16-norm={}'.format(args.norm)))
    config = get_config()
-    if args.load:
-        config.session_init = get_model_loader(args.load)
    nr_tower = max(get_nr_gpu(), 1)
    trainer = SyncMultiGPUTrainerReplicated(nr_tower)
    launch_train_with_config(config, trainer)
--- a/tensorpack/utils/utils.py
+++ b/tensorpack/utils/utils.py
@@ -170,9 +170,13 @@ def get_tqdm_kwargs(**kwargs):
        bar_format='{l_bar}{bar}|{n_fmt}/{total_fmt}[{elapsed}<{remaining},{rate_noinv_fmt}]'
    )
+    try:
+        # Use this env var to override the refresh interval setting
+        interval = float(os.environ['TENSORPACK_PROGRESS_REFRESH'])
+    except KeyError:
        f = kwargs.get('file', sys.stderr)
        isatty = f.isatty()
-    # NOTE when run under mpirun/slurm, isatty is always False
        # Jupyter notebook should be recognized as tty.
        # Wait for https://github.com/ipython/ipykernel/issues/268
        try:
@@ -183,10 +187,21 @@ def get_tqdm_kwargs(**kwargs):
            pass
        if isatty:
-        default['mininterval'] = 0.5
+            interval = 0.5
        else:
+            # When run under mpirun/slurm, isatty is always False.
+            # Here we apply some hacky heuristics for slurm.
+            if 'SLURM_JOB_ID' in os.environ:
+                if int(os.environ.get('SLURM_JOB_NUM_NODES', 1)) > 1:
+                    # multi-machine job, probably not interactive
+                    interval = 180
+                else:
+                    # possibly interactive, so let's be conservative
+                    interval = 15
            # If not a tty, don't refresh progress bar that often
-        default['mininterval'] = 180
+            interval = 180
+    default['mininterval'] = interval
    default.update(kwargs)
    return default