Commit dd138d5a authored by Yuxin Wu

silence autograph warnings; update docs

parent 23239bd7
@@ -26,10 +26,10 @@ matrix:
       env: TF_VERSION=1.3.0 TF_TYPE=release
     - os: linux
       python: 2.7
-      env: TF_VERSION=1.12.0 TF_TYPE=release
+      env: TF_VERSION=1.14.0 TF_TYPE=release
     - os: linux
       python: 3.6
-      env: TF_VERSION=1.12.0 TF_TYPE=release PYPI=true
+      env: TF_VERSION=1.14.0 TF_TYPE=release PYPI=true
     - os: linux
       python: 2.7
       env: TF_TYPE=nightly
@@ -66,15 +66,17 @@ Efficiency:

 1. After warmup, the training speed will slowly decrease due to more accurate proposals.

-1. The code should have around 80~90% GPU utilization on V100s, and 85%~90% scaling
-   efficiency from 1 V100 to 8 V100s.
+1. The code should have around 85~90% GPU utilization on one V100.
+   Scalability isn't very meaningful since the amount of computation each GPU performs is data-dependent.
+   If all images have the same spatial size (in which case the per-GPU computation is *still different*),
+   then an 85%~90% scaling efficiency is observed when using 8 V100s and `HorovodTrainer`.

 1. This implementation does not use specialized CUDA ops (e.g. AffineChannel, ROIAlign).
    Therefore it might be slower than other highly-optimized implementations.

 1. To reduce RAM usage on host: (1) make sure you're using the "spawn" method as
    set in `train.py`; (2) reduce `buffer_size` or `NUM_WORKERS` in `data.py`
-   (which may negatively impact your throughput). The training needs <10G RAM if `NUM_WORKERS=0`.
+   (which may negatively impact your throughput). The training only needs <10G RAM if `NUM_WORKERS=0`.

 1. Inference is unoptimized. Tensorpack is a training interface, therefore it
    does not help you with optimized inference. In fact, the current implementation
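Regarding point (1) in the RAM note above: a minimal sketch, assuming only the stock Python `multiprocessing` API, of selecting the "spawn" start method (the actual wiring lives in this example's `train.py`):

```python
# A minimal sketch, assuming the standard `multiprocessing` API. "spawn" starts
# fresh interpreter processes instead of forking, so worker processes do not
# inherit (and gradually copy, via refcount writes on copy-on-write pages) the
# parent's memory, which keeps host RAM usage lower.
import multiprocessing as mp

if __name__ == '__main__':
    mp.set_start_method('spawn')
    # ... build the dataflow and start training afterwards ...
```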
@@ -257,10 +257,6 @@ def finalize_configs(is_training):
     if _C.TRAINER == 'horovod':
         import horovod.tensorflow as hvd
         ngpu = hvd.size()
-
-        if ngpu == hvd.local_size():
-            logger.warn("It's not recommended to use horovod for single-machine training. "
-                        "Replicated trainer is more stable and has the same efficiency.")
     else:
         assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
         ngpu = get_num_gpu()
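For context on the check removed above, a minimal sketch, assuming the standard Horovod API, of how the single-machine condition is detected:

```python
# hvd.size() is the total number of worker processes across all machines,
# while hvd.local_size() is the number of workers on the current machine.
# Equality means all workers are local, i.e. a single-machine run -- the
# condition the removed warning keyed on.
import horovod.tensorflow as hvd

hvd.init()
if hvd.size() == hvd.local_size():
    print("single-machine run: all %d workers are local" % hvd.size())
```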
@@ -121,9 +121,10 @@ class TrainingDataPreprocessor:
     def __init__(self, cfg):
         self.cfg = cfg
-        self.aug = imgaug.AugmentorList(
-            [CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE), imgaug.Flip(horiz=True)]
-        )
+        self.aug = imgaug.AugmentorList([
+            CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
+            imgaug.Flip(horiz=True)
+        ])

     def __call__(self, roidb):
         fname, boxes, klass, is_crowd = roidb["file_name"], roidb["boxes"], roidb["class"], roidb["is_crowd"]
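A hedged usage sketch of the pipeline built above, assuming tensorpack's `dataflow.imgaug` API of this era; the image is a placeholder array, and `CustomResize` (defined in this example's own code) is omitted:

```python
# An AugmentorList applies each augmentor in order to an image.
import numpy as np
from tensorpack.dataflow import imgaug

aug = imgaug.AugmentorList([
    imgaug.Flip(horiz=True),   # CustomResize from this example would come first
])
img = np.zeros((600, 800, 3), dtype=np.uint8)  # placeholder image
out = aug.augment(img)
```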
@@ -56,6 +56,15 @@ def internal_update_bn_ema(xn, batch_mean, batch_var,
     return tf.identity(xn, name='output')


+try:
+    # When BN is used as an activation, keras layers try to autograph.convert it.
+    # This leads to massive warnings so we disable it.
+    from tensorflow.python.autograph.impl.api import do_not_convert as disable_autograph
+except ImportError:
+    def disable_autograph():
+        return lambda x: x
+
+
 @layer_register()
 @convert_to_tflayer_args(
     args_names=[],
@@ -66,6 +75,7 @@ def internal_update_bn_ema(xn, batch_mean, batch_var,
         'decay': 'momentum',
         'use_local_stat': 'training'
     })
+@disable_autograph()
 def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
               center=True, scale=True,
               beta_initializer=tf.zeros_initializer(),
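The fallback above works because both branches expose the same call shape: TF's `do_not_convert` returns a decorator when called with no arguments, and the `ImportError` stand-in mimics that with an identity decorator. A minimal sketch of the pattern, with a hypothetical `silent_fn` for illustration only:

```python
# Optional-decorator pattern: a factory returning an identity decorator,
# so `@disable_autograph()` is valid whether or not the real import succeeds.
def disable_autograph():          # stand-in for the fallback branch
    return lambda func: func      # identity decorator: function is unchanged

@disable_autograph()              # hypothetical usage, for illustration only
def silent_fn(x):
    return x + 1

assert silent_fn(1) == 2          # behavior is untouched
```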