Commit 56a77747 authored by Yuxin Wu

Heuristics for progress bar refresh interval

parent 3a0c5e9a
...@@ -33,6 +33,7 @@ paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convo ...@@ -33,6 +33,7 @@ paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convo
Trained with 64x2 batch size, the script reaches 58% single-crop validation Trained with 64x2 batch size, the script reaches 58% single-crop validation
accuracy after 100 epochs (21 hours on 2 V100s). accuracy after 100 epochs (21 hours on 2 V100s).
It also puts in tensorboard the first-layer filter visualizations similar to the paper. It also puts in tensorboard the first-layer filter visualizations similar to the paper.
See `./alexnet.py --help` for usage.
### Inception-BN, VGG16 ### Inception-BN, VGG16
...@@ -43,6 +44,7 @@ is a bit vague on these details. ...@@ -43,6 +44,7 @@ is a bit vague on these details.
This VGG16 script, when trained with 32x8 batch size, reaches the following This VGG16 script, when trained with 32x8 batch size, reaches the following
validation error after 100 epochs (30h with 8 P100s). This is the code for the VGG validation error after 100 epochs (30h with 8 P100s). This is the code for the VGG
experiments in the paper [Group Normalization](https://arxiv.org/abs/1803.08494). experiments in the paper [Group Normalization](https://arxiv.org/abs/1803.08494).
See `./vgg16.py --help` for usage.
| No Normalization | Batch Normalization | Group Normalization | | No Normalization | Batch Normalization | Group Normalization |
|:------------------------------------------|---------------------|--------------------:| |:------------------------------------------|---------------------|--------------------:|
......
...@@ -126,7 +126,6 @@ if __name__ == '__main__': ...@@ -126,7 +126,6 @@ if __name__ == '__main__':
parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.') parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
parser.add_argument('--data', help='ILSVRC dataset dir') parser.add_argument('--data', help='ILSVRC dataset dir')
parser.add_argument('--batch', type=int, default=32, help='batch per GPU') parser.add_argument('--batch', type=int, default=32, help='batch per GPU')
parser.add_argument('--load', help='load model')
args = parser.parse_args() args = parser.parse_args()
if args.gpu: if args.gpu:
......
...@@ -8,7 +8,7 @@ import os ...@@ -8,7 +8,7 @@ import os
import tensorflow as tf import tensorflow as tf
from tensorpack import * from tensorpack import *
from tensorpack.tfutils import argscope, get_model_loader from tensorpack.tfutils import argscope
from tensorpack.tfutils.summary import * from tensorpack.tfutils.summary import *
from tensorpack.utils.gpu import get_nr_gpu from tensorpack.utils.gpu import get_nr_gpu
...@@ -151,7 +151,6 @@ if __name__ == '__main__': ...@@ -151,7 +151,6 @@ if __name__ == '__main__':
parser.add_argument('--data', help='ILSVRC dataset dir') parser.add_argument('--data', help='ILSVRC dataset dir')
parser.add_argument('--batch', type=int, default=32, help='batch per GPU') parser.add_argument('--batch', type=int, default=32, help='batch per GPU')
parser.add_argument('--norm', choices=['none', 'bn', 'gn'], default='none') parser.add_argument('--norm', choices=['none', 'bn', 'gn'], default='none')
parser.add_argument('--load', help='load model')
args = parser.parse_args() args = parser.parse_args()
if args.gpu: if args.gpu:
...@@ -160,8 +159,6 @@ if __name__ == '__main__': ...@@ -160,8 +159,6 @@ if __name__ == '__main__':
logger.set_logger_dir(os.path.join('train_log', 'vgg16-norm={}'.format(args.norm))) logger.set_logger_dir(os.path.join('train_log', 'vgg16-norm={}'.format(args.norm)))
config = get_config() config = get_config()
if args.load:
config.session_init = get_model_loader(args.load)
nr_tower = max(get_nr_gpu(), 1) nr_tower = max(get_nr_gpu(), 1)
trainer = SyncMultiGPUTrainerReplicated(nr_tower) trainer = SyncMultiGPUTrainerReplicated(nr_tower)
launch_train_with_config(config, trainer) launch_train_with_config(config, trainer)
...@@ -170,23 +170,38 @@ def get_tqdm_kwargs(**kwargs): ...@@ -170,23 +170,38 @@ def get_tqdm_kwargs(**kwargs):
bar_format='{l_bar}{bar}|{n_fmt}/{total_fmt}[{elapsed}<{remaining},{rate_noinv_fmt}]' bar_format='{l_bar}{bar}|{n_fmt}/{total_fmt}[{elapsed}<{remaining},{rate_noinv_fmt}]'
) )
f = kwargs.get('file', sys.stderr)
isatty = f.isatty()
# NOTE when run under mpirun/slurm, isatty is always False
# Jupyter notebook should be recognized as tty.
# Wait for https://github.com/ipython/ipykernel/issues/268
try: try:
from ipykernel import iostream # Use this env var to override the refresh interval setting
if isinstance(f, iostream.OutStream): interval = float(os.environ['TENSORPACK_PROGRESS_REFRESH'])
isatty = True except KeyError:
except ImportError:
pass f = kwargs.get('file', sys.stderr)
isatty = f.isatty()
if isatty: # Jupyter notebook should be recognized as tty.
default['mininterval'] = 0.5 # Wait for https://github.com/ipython/ipykernel/issues/268
else: try:
# If not a tty, don't refresh progress bar that often from ipykernel import iostream
default['mininterval'] = 180 if isinstance(f, iostream.OutStream):
isatty = True
except ImportError:
pass
if isatty:
interval = 0.5
else:
# When run under mpirun/slurm, isatty is always False.
# Here we apply some hacky heuristics for slurm.
if 'SLURM_JOB_ID' in os.environ:
if int(os.environ.get('SLURM_JOB_NUM_NODES', 1)) > 1:
# multi-machine job, probably not interactive
interval = 180
else:
# possibly interactive, so let's be conservative
interval = 15
# If not a tty, don't refresh progress bar that often
interval = 180
default['mininterval'] = interval
default.update(kwargs) default.update(kwargs)
return default return default
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment