Commit acd7f798 authored by Yuxin Wu

use relative id in TrainConfig.tower

parent f6acf786
@@ -12,7 +12,7 @@ See some interesting [examples](https://github.com/ppwwyyxx/tensorpack/tree/mast
 ## Features:
-Focused on modularity. Just have to define the three components to start a training:
+Focus on modularity. You just have to define the following three components to start a training:
 1. The model, or the graph. Define the graph as well as its inputs and outputs. `models/` has some scoped abstraction of common models.
...
@@ -20,7 +20,7 @@ Cifar10:
 91% accuracy after 80k step.
 19.3 step/s on Tesla M40
-Not a good for Cifar100, just for demonstration.
+Not a good model for Cifar100, just for demonstration.
 """
 class Model(ModelDesc):
...
@@ -13,10 +13,10 @@ from tensorpack.tfutils.symbolic_functions import *
 from tensorpack.tfutils.summary import *
 """
-SVHN convnet.
+A very small SVHN convnet model (only 0.8m parameters).
 About 3.0% validation error after 70 epoch. 2.5% after 130 epoch.
-Each epoch is set to 4721 iterations. The speed is about 44 it/s on a Tesla M30
+Each epoch is set to 4721 iterations. The speed is about 44 it/s on a Tesla M40
 """
 class Model(ModelDesc):
...
@@ -32,6 +32,7 @@ class HistoryFramePlayer(ProxyPlayer):
         zeros = [np.zeros_like(self.history[0]) for k in range(diff_len)]
         for k in self.history:
             zeros.append(k)
+        assert len(zeros) == self.history.maxlen
         return np.concatenate(zeros, axis=2)

     def action(self, act):
...
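For context, the invariant the added assert guards can be reproduced in a few lines: when the history deque is not yet full, zero frames are prepended so the stacked observation always has exactly `maxlen` frames. The following standalone sketch is not part of the commit, and the 84x84 frame shape is an arbitrary assumption.

import numpy as np
from collections import deque

history = deque(maxlen=4)
history.append(np.ones((84, 84, 1)))   # only one real frame observed so far

# prepend zero frames until the stack reaches maxlen, as in the method shown above
diff_len = history.maxlen - len(history)
zeros = [np.zeros_like(history[0]) for _ in range(diff_len)]
for frame in history:
    zeros.append(frame)
assert len(zeros) == history.maxlen    # the invariant added by this commit
obs = np.concatenate(zeros, axis=2)
print(obs.shape)                       # (84, 84, 4)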
@@ -31,7 +31,8 @@ class TrainConfig(object):
         :param starting_epoch: int. default to be 1.
         :param step_per_epoch: the number of steps (SGD updates) to perform in each epoch.
         :param max_epoch: maximum number of epoch to run training. default to inf
-        :param nr_tower: int. number of towers. default to 1.
+        :param nr_tower: int. number of training towers. default to 1.
+        :param tower: list of training towers in relative id. default to `range(nr_tower)` if nr_tower is given.
         :param extra_threads_procs: list of `Startable` threads or processes
         """
         def assert_type(v, tp):
@@ -53,7 +54,17 @@ class TrainConfig(object):
         self.starting_epoch = int(kwargs.pop('starting_epoch', 1))
         self.max_epoch = int(kwargs.pop('max_epoch', 99999))
         assert self.step_per_epoch > 0 and self.max_epoch > 0
-        self.nr_tower = int(kwargs.pop('nr_tower', 1))
+        nr_tower = kwargs.pop('nr_tower', None)
+        tower = kwargs.pop('tower', None)
+        assert nr_tower is None or tower is None, "Cannot set both nr_tower and tower!"
+        if nr_tower:
+            tower = list(range(nr_tower))
+        else:
+            if isinstance(tower, int):
+                tower = list(range(tower))
+        self.tower = tower
         self.extra_threads_procs = kwargs.pop('extra_threads_procs', [])
         assert len(kwargs) == 0, 'Unknown arguments: {}'.format(str(kwargs.keys()))
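As a rough usage sketch (not part of the commit), the normalization above can be exercised in isolation; `TrainConfig`'s other required arguments are omitted, so the helper below merely mirrors the added logic to show what `self.tower` ends up holding.

def normalize_tower(nr_tower=None, tower=None):
    # mirrors the kwargs handling added above
    assert nr_tower is None or tower is None, "Cannot set both nr_tower and tower!"
    if nr_tower:
        tower = list(range(nr_tower))
    else:
        if isinstance(tower, int):
            tower = list(range(tower))
    return tower

print(normalize_tower(nr_tower=2))    # [0, 1]  -- old-style nr_tower still works
print(normalize_tower(tower=3))       # [0, 1, 2]  -- an int tower is expanded
print(normalize_tower(tower=[0, 2]))  # [0, 2]  -- explicit relative ids kept as-is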
@@ -38,13 +38,14 @@ class MultiGPUTrainer(QueueInputTrainer):
         return ret

     def _multi_tower_grads(self):
-        logger.info("Training a model of {} tower".format(self.config.nr_tower))
+        logger.info("Training a model of {} tower".format(
+            len(self.config.tower)))
         grad_list = []
-        for i in range(self.config.nr_tower):
-            with tf.device('/gpu:{}'.format(i)), \
-                    tf.name_scope('tower{}'.format(i)) as scope:
-                logger.info("Building graph for training tower {}...".format(i))
+        for idx, t in enumerate(self.config.tower):
+            with tf.device('/gpu:{}'.format(t)), \
+                    tf.name_scope('tower{}'.format(idx)) as scope:
+                logger.info("Building graph for training tower {}...".format(idx))
                 model_inputs = self._get_model_inputs()    # each tower dequeue from input queue
                 self.dequed_inputs.append(model_inputs)
@@ -55,7 +56,7 @@ class MultiGPUTrainer(QueueInputTrainer):
                 grad_list.append(
                     self.config.optimizer.compute_gradients(cost_var, gate_gradients=0))
-                if i == 0:
+                if idx == 0:
                     tf.add_to_collection(MOVING_SUMMARY_VARS_KEY, cost_var)
                 tf.get_variable_scope().reuse_variables()
                 # avoid repeated summary from each device
...
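To make the relative indexing concrete: with `tower = [0, 2]`, the loop above builds the first tower on `/gpu:0` under name scope `tower0` and the second on `/gpu:2` under name scope `tower1` (the name scope uses the relative position, the device uses the id listed in `config.tower`). A TensorFlow-free sketch of that mapping, with an arbitrary example list:

tower = [0, 2]   # example relative ids, as they would appear in config.tower
for idx, t in enumerate(tower):
    device = '/gpu:{}'.format(t)       # device comes from the listed id
    scope = 'tower{}'.format(idx)      # name scope comes from the position
    print(idx, device, scope)
# 0 /gpu:0 tower0
# 1 /gpu:2 tower1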
@@ -138,7 +138,7 @@ class QueueInputTrainer(Trainer):
         :param config: a `TrainConfig` instance
         :param input_queue: a `tf.QueueBase` instance to be used to buffer datapoints.
             Defaults to a FIFO queue of size 100.
-        :param predict_tower: list of gpu idx to run prediction. default to be [0].
+        :param predict_tower: list of gpu relative idx to run prediction. default to be [0].
             Use -1 for cpu.
         """
         super(QueueInputTrainer, self).__init__(config)
@@ -189,7 +189,7 @@ class QueueInputTrainer(Trainer):
         self._extra_threads_procs.append(self.input_th)

     def train(self):
-        assert self.config.nr_tower == 1, \
+        assert len(self.config.tower) == 1, \
             "QueueInputTrainer doesn't support multigpu! Use Sync/AsyncMultiGPUTrainer instead."
         self.init_session_and_coord()
         self._build_enque_thread()
...
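A hedged sketch of how a `predict_tower` list could translate to device strings under the docstring above, with -1 meaning CPU; the helper is illustrative only and is not the trainer's actual implementation.

def tower_device(idx):
    # illustrative mapping only; -1 selects the CPU per the docstring above
    return '/cpu:0' if idx == -1 else '/gpu:{}'.format(idx)

print([tower_device(i) for i in [0]])      # default: ['/gpu:0']
print([tower_device(i) for i in [-1, 1]])  # ['/cpu:0', '/gpu:1']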