Commit a3581e79 authored by Yuxin Wu

update docs

parent 801e2921
......@@ -39,8 +39,10 @@ The tower function needs to follow some conventions:
To respect variable reuse, use `tf.get_variable` instead of `tf.Variable` in the function.
On the other hand, for non-trainable variables, it's OK to use
`tf.Variable` to ensure creation of new variables in each tower even when `reuse=True`.
4. It will always be called under a `TowerContext`, which can be accessed by `get_current_tower_contxt()`.
4. It will always be called under a `TowerContext`, which can be accessed by `get_current_tower_context()`.
The context contains information about training/inference mode, reuse, etc.
5. It cannot create scopes or variables containing the name 'tower', as it is
reserved for special use.
These conventions are easy to follow, and most layer wrappers (e.g.,
tf.layers/slim/tensorlayer) do follow them. Note that certain Keras layers do not
......
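A minimal sketch of a tower function that follows the conventions listed in the docs hunk above; the layer sizes and names are illustrative, not taken from the repository:

```python
import tensorflow as tf
from tensorpack.tfutils.tower import get_current_tower_context


def tower_func(image):
    # Always called under a TowerContext (convention 4), so the context is available:
    ctx = get_current_tower_context()
    is_training = ctx.is_training          # training/inference mode from the context

    # tf.layers uses tf.get_variable internally, so variables are reused across towers.
    x = tf.layers.conv2d(image, 32, 3, activation=tf.nn.relu, name='conv0')
    x = tf.layers.dropout(x, rate=0.5, training=is_training)
    logits = tf.layers.dense(tf.layers.flatten(x), 10, name='fc0')

    # Convention 5: no scope or variable named 'tower' -- that name is reserved.
    return logits
```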
......@@ -19,7 +19,7 @@ from tensorpack import *
from tensorpack.utils.concurrency import ensure_proc_terminate, start_proc_mask_signal
from tensorpack.utils.serialize import dumps
from tensorpack.tfutils.gradproc import MapGradient, SummaryGradient
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
import gym
......@@ -144,10 +144,10 @@ class MySimulatorMaster(SimulatorMaster, Callback):
def _setup_graph(self):
# create predictors on the available predictor GPUs.
nr_gpu = len(self._gpus)
num_gpu = len(self._gpus)
predictors = [self.trainer.get_predictor(
['state'], ['policy', 'pred_value'],
self._gpus[k % nr_gpu])
self._gpus[k % num_gpu])
for k in range(PREDICTOR_THREAD)]
self.async_predictor = MultiThreadAsyncPredictor(
predictors, batch_size=PREDICT_BATCH_SIZE)
......@@ -213,16 +213,16 @@ def train():
logger.set_logger_dir(dirname)
# assign GPUs for training & inference
nr_gpu = get_nr_gpu()
num_gpu = get_num_gpu()
global PREDICTOR_THREAD
if nr_gpu > 0:
if nr_gpu > 1:
if num_gpu > 0:
if num_gpu > 1:
# use half gpus for inference
predict_tower = list(range(nr_gpu))[-nr_gpu // 2:]
predict_tower = list(range(num_gpu))[-num_gpu // 2:]
else:
predict_tower = [0]
PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
train_tower = list(range(nr_gpu))[:-nr_gpu // 2] or [0]
train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
else:
......
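As a quick check of the renamed GPU-split logic above, here is what the slicing yields for a few hypothetical GPU counts (illustration only, not part of the commit):

```python
# Reproduces the train/inference split from train() above for hypothetical GPU counts.
for num_gpu in (2, 3, 4):
    predict_tower = list(range(num_gpu))[-num_gpu // 2:]
    train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
    print(num_gpu, train_tower, predict_tower)
# num_gpu=2 -> train on [0],    infer on [1]
# num_gpu=3 -> train on [0],    infer on [1, 2]   (note: -num_gpu // 2 == -2 here)
# num_gpu=4 -> train on [0, 1], infer on [2, 3]
```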
......@@ -15,7 +15,7 @@ from tensorpack import *
from tensorpack.tfutils.summary import add_param_summary
from tensorpack.tfutils.varreplace import remap_variables
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from imagenet_utils import get_imagenet_dataflow, fbresnet_augmentor, ImageNetModel
from dorefa import get_dorefa, ternarize
......@@ -215,7 +215,7 @@ if __name__ == '__main__':
run_image(Model(), DictRestore(dict(np.load(args.load))), args.run)
sys.exit()
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower
logger.set_logger_dir(os.path.join(
'train_log', 'alexnet-dorefa-{}'.format(args.dorefa)))
......
......@@ -257,8 +257,8 @@ if __name__ == '__main__':
args = parser.parse_args()
with change_gpu(args.gpu):
NR_GPU = len(args.gpu.split(','))
NGPU = len(args.gpu.split(','))
config = get_config()
if args.load:
config.session_init = SaverRestore(args.load)
launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
launch_train_with_config(config, SyncMultiGPUTrainer(NGPU))
......@@ -5,7 +5,7 @@
from tensorpack import *
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
import tensorflow as tf
......@@ -137,7 +137,7 @@ if __name__ == '__main__':
input = QueueInput(DCGAN.get_data())
model = Model()
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
if nr_tower == 1:
trainer = GANTrainer(input, model)
else:
......
......@@ -149,10 +149,10 @@ class MultiGPUGANTrainer(TowerTrainer):
"""
A replacement of GANTrainer (optimize d and g one by one) with multi-gpu support.
"""
def __init__(self, nr_gpu, input, model):
def __init__(self, num_gpu, input, model):
super(MultiGPUGANTrainer, self).__init__()
assert nr_gpu > 1
raw_devices = ['/gpu:{}'.format(k) for k in range(nr_gpu)]
assert num_gpu > 1
raw_devices = ['/gpu:{}'.format(k) for k in range(num_gpu)]
# Setup input
input = StagingInput(input)
......@@ -167,13 +167,13 @@ class MultiGPUGANTrainer(TowerTrainer):
self.tower_func = TowerFuncWrapper(get_cost, model.get_inputs_desc())
devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
cost_list = DataParallelBuilder.build_on_towers(
list(range(nr_gpu)),
list(range(num_gpu)),
lambda: self.tower_func(*input.get_input_tensors()),
devices)
# Simply average the cost here. It might be faster to average the gradients
with tf.name_scope('optimize'):
d_loss = tf.add_n([x[0] for x in cost_list]) * (1.0 / nr_gpu)
g_loss = tf.add_n([x[1] for x in cost_list]) * (1.0 / nr_gpu)
d_loss = tf.add_n([x[0] for x in cost_list]) * (1.0 / num_gpu)
g_loss = tf.add_n([x[1] for x in cost_list]) * (1.0 / num_gpu)
opt = model.get_optimizer()
# run one d_min after one g_min
......
......@@ -12,7 +12,7 @@ import os
from tensorpack import *
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.tfutils import optimizer, gradproc
from tensorpack.tfutils.summary import add_moving_summary, add_param_summary
......@@ -256,4 +256,4 @@ if __name__ == '__main__':
config.session_init = get_model_loader(args.load)
launch_train_with_config(
config,
SyncMultiGPUTrainer(max(get_nr_gpu(), 1)))
SyncMultiGPUTrainer(max(get_num_gpu(), 1)))
......@@ -11,14 +11,14 @@ import tensorflow as tf
from tensorpack import *
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from imagenet_utils import fbresnet_augmentor, get_imagenet_dataflow
# Change them if using different number of GPUs.
TOTAL_BATCH_SIZE = 64 * 6
NR_GPU = 6
BATCH_SIZE = TOTAL_BATCH_SIZE // NR_GPU
NUM_GPU = 6
BATCH_SIZE = TOTAL_BATCH_SIZE // NUM_GPU
INPUT_SHAPE = 224
......@@ -169,6 +169,6 @@ if __name__ == '__main__':
config = get_config()
if args.load:
config.session_init = SaverRestore(args.load)
nr_tower = get_nr_gpu()
assert nr_tower == NR_GPU
launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
nr_tower = get_num_gpu()
assert nr_tower == NUM_GPU
launch_train_with_config(config, SyncMultiGPUTrainer(NUM_GPU))
......@@ -14,7 +14,7 @@ from tensorpack import *
from tensorpack.dataflow import imgaug
from tensorpack.tfutils import argscope, get_model_loader, model_utils
from tensorpack.tfutils.scope_utils import under_name_scope
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from imagenet_utils import (
get_imagenet_dataflow,
......@@ -212,7 +212,7 @@ if __name__ == '__main__':
else:
logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
config = get_config(model, nr_tower)
if args.load:
config.session_init = get_model_loader(args.load)
......
......@@ -10,7 +10,7 @@ import tensorflow as tf
from tensorpack import *
from tensorpack.tfutils import argscope
from tensorpack.tfutils.summary import *
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from imagenet_utils import (
ImageNetModel, get_imagenet_dataflow, fbresnet_augmentor)
......@@ -108,7 +108,7 @@ def get_data(name, batch):
def get_config():
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
batch = args.batch
total_batch = batch * nr_tower
assert total_batch >= 256 # otherwise the learning rate warmup is wrong.
......@@ -159,6 +159,6 @@ if __name__ == '__main__':
logger.set_logger_dir(os.path.join('train_log', 'vgg16-norm={}'.format(args.norm)))
config = get_config()
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
trainer = SyncMultiGPUTrainerReplicated(nr_tower)
launch_train_with_config(config, trainer)
......@@ -27,8 +27,8 @@ These are all the toy examples in tensorpack. They are supposed to be just demos
| --- | --- |
| Train [ResNet](ResNet), [ShuffleNet and other models](ImageNetModels) on ImageNet | reproduce paper |
| [Train Faster-RCNN / Mask-RCNN on COCO](FasterRCNN) | reproduce paper |
| [DoReFa-Net: training binary / low-bitwidth CNN on ImageNet](DoReFa-Net) | reproduce paper |
| [Generative Adversarial Network(GAN) variants](GAN), including DCGAN, InfoGAN, <br/> Conditional GAN, WGAN, BEGAN, DiscoGAN, Image to Image, CycleGAN | visually reproduce |
| [DoReFa-Net: training binary / low-bitwidth CNN on ImageNet](DoReFa-Net) | reproduce paper |
| [Fully-convolutional Network for Holistically-Nested Edge Detection(HED)](HED) | visually reproduce |
| [Spatial Transformer Networks on MNIST addition](SpatialTransformer) | reproduce paper |
| [Visualize CNN saliency maps](Saliency) | visually reproduce |
......
......@@ -9,7 +9,7 @@ import os
from tensorpack import *
from tensorpack.tfutils.summary import add_moving_summary, add_param_summary
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.dataflow import dataset
import tensorflow as tf
......@@ -170,5 +170,5 @@ if __name__ == '__main__':
max_epoch=400,
session_init=SaverRestore(args.load) if args.load else None
)
nr_gpu = max(get_nr_gpu(), 1)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
num_gpu = max(get_num_gpu(), 1)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(num_gpu))
......@@ -12,7 +12,7 @@ from tensorpack.train import (
TrainConfig, SyncMultiGPUTrainerReplicated, launch_train_with_config)
from tensorpack.dataflow import FakeData
from tensorpack.tfutils import argscope, get_model_loader
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from imagenet_utils import (
fbresnet_augmentor, get_imagenet_dataflow, ImageNetModel,
......@@ -57,7 +57,7 @@ def get_data(name, batch):
def get_config(model, fake=False):
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
assert args.batch % nr_tower == 0
batch = args.batch // nr_tower
......@@ -143,5 +143,5 @@ if __name__ == '__main__':
config = get_config(model, fake=args.fake)
if args.load:
config.session_init = get_model_loader(args.load)
trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
launch_train_with_config(config, trainer)
......@@ -16,7 +16,7 @@ from tensorpack.dataflow import dataset
from tensorpack.tfutils import optimizer, gradproc
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.tfutils.summary import *
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.utils import viz
from imagenet_utils import (
......@@ -157,8 +157,8 @@ if __name__ == '__main__':
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
nr_gpu = get_nr_gpu()
BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu
num_gpu = get_num_gpu()
BATCH_SIZE = TOTAL_BATCH_SIZE // num_gpu
if args.cam:
BATCH_SIZE = 128 # something that can run on one gpu
......@@ -169,4 +169,4 @@ if __name__ == '__main__':
config = get_config()
if args.load:
config.session_init = get_model_loader(args.load)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(num_gpu))
......@@ -13,6 +13,7 @@ from tensorpack import *
from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.utils import logger
from tensorpack.utils.gpu import get_num_gpu
from data_sampler import (
ImageDecode, ImageDataFromZIPFile,
RejectTooSmallImages, CenterSquareResize)
......@@ -286,7 +287,7 @@ if __name__ == '__main__':
param_dict = {'VGG19/' + name: value for name, value in six.iteritems(param_dict)}
session_init = DictRestore(param_dict)
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
data = QueueInput(get_data(args.data))
model = Model()
......
## Keras + Tensorpack
Use Keras to define a model a train it with efficient tensorpack trainers.
Use Keras to define a model and train it with efficient tensorpack trainers.
### Why?
Keras alone has various overhead. In particular, it is not efficient when working on large models.
Keras alone has various overhead. In particular, it is not efficient with large models.
The article [Towards Efficient Multi-GPU Training in Keras with TensorFlow](https://medium.com/rossum/towards-efficient-multi-gpu-training-in-keras-with-tensorflow-8a0091074fb2)
has mentioned some of it.
......
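A rough end-to-end sketch of the workflow this README describes. The KerasModel keyword arguments mirror the call visible in the diff below; the toy model, the FakeData shapes, and the compile/fit settings are assumptions for illustration, not the example's real ResNet-50 configuration.

```python
import tensorflow as tf
from tensorpack import InputDesc, SyncMultiGPUTrainerReplicated
from tensorpack.contrib.keras import KerasModel
from tensorpack.dataflow import FakeData
from tensorpack.utils.gpu import get_num_gpu
from tensorflow.python.keras.layers import Dense, Flatten, Input
from tensorflow.python.keras.models import Model


def toy_model(image):
    # Hypothetical stand-in for the ResNet-50 used in the real example:
    # build a tf.keras Model on top of the input tensor tensorpack provides.
    inp = Input(tensor=image)
    x = Flatten()(inp)
    x = Dense(256, activation='relu')(x)
    out = Dense(10, activation='softmax')(x)
    return Model(inputs=inp, outputs=out)


if __name__ == '__main__':
    num_gpu = max(get_num_gpu(), 1)
    # Batched fake data, in the same spirit as the FakeData calls in the diff below.
    df_train = FakeData([[32, 28, 28, 1], [32, 10]], 1000, random=False, dtype='float32')

    M = KerasModel(
        toy_model,
        inputs_desc=[InputDesc(tf.float32, [None, 28, 28, 1], 'images')],
        targets_desc=[InputDesc(tf.float32, [None, 10], 'labels')],
        input=df_train,
        trainer=SyncMultiGPUTrainerReplicated(num_gpu))
    # The compile/fit arguments below are illustrative assumptions:
    M.compile(
        optimizer=tf.train.AdamOptimizer(1e-3),
        loss='categorical_crossentropy',
        metrics='categorical_accuracy')
    M.fit(steps_per_epoch=30, max_epoch=2)
```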
......@@ -11,7 +11,7 @@ import argparse
from tensorpack import InputDesc, SyncMultiGPUTrainerReplicated
from tensorpack.dataflow import FakeData, MapDataComponent
from tensorpack.utils import logger
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.contrib.keras import KerasModel
from tensorpack.callbacks import *
from tensorflow.python.keras.layers import *
......@@ -141,12 +141,12 @@ if __name__ == '__main__':
tf.keras.backend.set_image_data_format('channels_first')
nr_gpu = get_nr_gpu()
num_gpu = get_num_gpu()
if args.fake:
df_train = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False, dtype='uint8')
df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
else:
batch_size = TOTAL_BATCH_SIZE // nr_gpu
batch_size = TOTAL_BATCH_SIZE // num_gpu
assert args.data is not None
df_train = get_imagenet_dataflow(
args.data, 'train', batch_size, fbresnet_augmentor(True))
......@@ -164,7 +164,7 @@ if __name__ == '__main__':
inputs_desc=[InputDesc(tf.uint8, [None, 224, 224, 3], 'images')],
targets_desc=[InputDesc(tf.float32, [None, 1000], 'labels')],
input=df_train,
trainer=SyncMultiGPUTrainerReplicated(nr_gpu))
trainer=SyncMultiGPUTrainerReplicated(num_gpu))
lr = tf.get_variable('learning_rate', initializer=0.1, trainable=False)
tf.summary.scalar('lr', lr)
......@@ -188,7 +188,7 @@ if __name__ == '__main__':
if not args.fake:
callbacks.append(
DataParallelInferenceRunner(
df_val, ScalarStats(['categorical_accuracy']), nr_gpu))
df_val, ScalarStats(['categorical_accuracy']), num_gpu))
M.fit(
steps_per_epoch=100 if args.fake else 1281167 // TOTAL_BATCH_SIZE,
......
......@@ -96,8 +96,11 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
When set to "nccl", this layer must be used under tensorpack multi-gpu trainers,
and it then uses per-machine (multiple GPU) statistics to normalize.
Note that this implementation averages the per-tower E[x] and E[x^2] among towers to compute
global mean&variance. The result is the global mean&variance only if each tower has the same batch size.
This option has no effect when not training.
The option is also known as "Cross-GPU BatchNorm" as mentioned in https://arxiv.org/abs/1711.07240.
This option is also known as "Cross-GPU BatchNorm" as mentioned in https://arxiv.org/abs/1711.07240.
Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222
Variable Names:
......
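A brief usage sketch of the cross-GPU option documented above, assuming it is exposed as the `sync_statistics` argument of tensorpack's `BatchNorm` (the argument name and the toy layers are assumptions, not shown in this diff):

```python
import tensorflow as tf
from tensorpack.models import BatchNorm
from tensorpack.tfutils import argscope


def tower_func(image):
    # Assumed usage: make every BatchNorm in this tower aggregate E[x] and E[x^2]
    # across GPUs via NCCL. Only meaningful under a tensorpack multi-GPU trainer,
    # and only has an effect during training, as the docstring above notes.
    with argscope(BatchNorm, sync_statistics='nccl'):
        x = tf.layers.conv2d(image, 64, 3, use_bias=False, name='conv0')
        x = tf.nn.relu(BatchNorm('bn0', x))
        # ... rest of the model ...
        return x
```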