Commit 56a77747 authored by Yuxin Wu

Heuristics for progress bar refresh interval

parent 3a0c5e9a
@@ -33,6 +33,7 @@ paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convo
 Trained with 64x2 batch size, the script reaches 58% single-crop validation
 accuracy after 100 epochs (21 hours on 2 V100s).
 It also puts in tensorboard the first-layer filter visualizations similar to the paper.
+See `./alexnet.py --help` for usage.
 ### Inception-BN, VGG16
@@ -43,6 +44,7 @@ is a bit vague on these details.
 This VGG16 script, when trained with 32x8 batch size, reaches the following
 validation error after 100 epochs (30h with 8 P100s). This is the code for the VGG
 experiments in the paper [Group Normalization](https://arxiv.org/abs/1803.08494).
+See `./vgg16.py --help` for usage.
 | No Normalization | Batch Normalization | Group Normalization |
 |:------------------------------------------|---------------------|--------------------:|
@@ -126,7 +126,6 @@ if __name__ == '__main__':
     parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
     parser.add_argument('--data', help='ILSVRC dataset dir')
     parser.add_argument('--batch', type=int, default=32, help='batch per GPU')
-    parser.add_argument('--load', help='load model')
     args = parser.parse_args()
     if args.gpu:
@@ -8,7 +8,7 @@ import os
 import tensorflow as tf
 from tensorpack import *
-from tensorpack.tfutils import argscope, get_model_loader
+from tensorpack.tfutils import argscope
 from tensorpack.tfutils.summary import *
 from tensorpack.utils.gpu import get_nr_gpu
@@ -151,7 +151,6 @@ if __name__ == '__main__':
     parser.add_argument('--data', help='ILSVRC dataset dir')
     parser.add_argument('--batch', type=int, default=32, help='batch per GPU')
     parser.add_argument('--norm', choices=['none', 'bn', 'gn'], default='none')
-    parser.add_argument('--load', help='load model')
     args = parser.parse_args()
     if args.gpu:
@@ -160,8 +159,6 @@ if __name__ == '__main__':
     logger.set_logger_dir(os.path.join('train_log', 'vgg16-norm={}'.format(args.norm)))
     config = get_config()
-    if args.load:
-        config.session_init = get_model_loader(args.load)
     nr_tower = max(get_nr_gpu(), 1)
     trainer = SyncMultiGPUTrainerReplicated(nr_tower)
     launch_train_with_config(config, trainer)
@@ -170,9 +170,13 @@ def get_tqdm_kwargs(**kwargs):
         bar_format='{l_bar}{bar}|{n_fmt}/{total_fmt}[{elapsed}<{remaining},{rate_noinv_fmt}]'
     )
+    try:
+        # Use this env var to override the refresh interval setting
+        interval = float(os.environ['TENSORPACK_PROGRESS_REFRESH'])
+    except KeyError:
         f = kwargs.get('file', sys.stderr)
         isatty = f.isatty()
         # NOTE when run under mpirun/slurm, isatty is always False
         # Jupyter notebook should be recognized as tty.
         # Wait for https://github.com/ipython/ipykernel/issues/268
         try:
@@ -183,10 +187,21 @@ def get_tqdm_kwargs(**kwargs):
             pass
         if isatty:
-        default['mininterval'] = 0.5
+            interval = 0.5
         else:
+            # When run under mpirun/slurm, isatty is always False.
+            # Here we apply some hacky heuristics for slurm.
+            if 'SLURM_JOB_ID' in os.environ:
+                if int(os.environ.get('SLURM_JOB_NUM_NODES', 1)) > 1:
+                    # multi-machine job, probably not interactive
+                    interval = 180
+                else:
+                    # possibly interactive, so let's be conservative
+                    interval = 15
+            else:
                 # If not a tty, don't refresh progress bar that often
-        default['mininterval'] = 180
+                interval = 180
+    default['mininterval'] = interval
     default.update(kwargs)
     return default
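
Taken together, the two hunks make `get_tqdm_kwargs()` choose `mininterval` from an explicit override, the terminal type, and the slurm environment. The helper below is only an illustrative summary of that decision tree, not a tensorpack API: the name `pick_refresh_interval` is made up, and it assumes the slurm-specific values are meant to take precedence over the generic 180-second fallback.

```python
import os
import sys


def pick_refresh_interval(stream=sys.stderr):
    """Illustrative summary of the new heuristic -- not part of tensorpack."""
    # 1. An explicit override always wins.
    override = os.environ.get('TENSORPACK_PROGRESS_REFRESH')
    if override is not None:
        return float(override)
    # 2. Interactive terminal (or a Jupyter notebook, once ipykernel allows
    #    detecting it): refresh twice per second.
    if stream.isatty():
        return 0.5
    # 3. isatty() is always False under mpirun/slurm, so guess from slurm vars:
    #    a multi-node job is probably a batch job; a single node may be an
    #    interactive session, so stay conservative.
    if 'SLURM_JOB_ID' in os.environ:
        if int(os.environ.get('SLURM_JOB_NUM_NODES', 1)) > 1:
            return 180
        return 15
    # 4. Plain redirected output (log files etc.): refresh rarely.
    return 180
```

The point of the slurm branch is that `isatty()` alone cannot distinguish a redirected log file from an interactive slurm session, so the node count is used as a cheap proxy for interactivity.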
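To pin the interval regardless of these heuristics, the new `TENSORPACK_PROGRESS_REFRESH` variable can be set before `get_tqdm_kwargs()` is called; the value is parsed as a float number of seconds. A minimal example (the value 10 is arbitrary):

```python
import os
# Must be set before get_tqdm_kwargs() is first called, e.g. at the top of a
# training script or in the job's environment.
os.environ['TENSORPACK_PROGRESS_REFRESH'] = '10'   # refresh every 10 seconds
```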