Commit f0243500, authored Oct 28, 2017 by Yuxin Wu

    Merge branch 'model-redesign'

Parents: d38d22bf, a867fa57

Showing 79 changed files with 1206 additions and 732 deletions (+1206 -732)
.github/ISSUE_TEMPLATE.md                          +2    -2
CHANGES.md                                         +8    -0
docs/conf.py                                       +1    -0
docs/tutorial/callback.md                          +42   -46
docs/tutorial/extend/trainer.md                    +21   -30
docs/tutorial/graph.md                             +0    -58
docs/tutorial/index.rst                            +1    -1
docs/tutorial/trainer.md                           +34   -29
docs/tutorial/training-interface.md                +67   -0
examples/A3C-Gym/train-atari.py                    +3    -2
examples/CTC-TIMIT/train-timit.py                  +3    -2
examples/Char-RNN/char-rnn.py                      +3    -2
examples/DeepQNetwork/DQN.py                       +3    -2
examples/DeepQNetwork/common.py                    +16   -18
examples/DeepQNetwork/expreplay.py                 +1    -1
examples/DisturbLabel/mnist-disturb.py             +2    -1
examples/DisturbLabel/svhn-disturb.py              +2    -1
examples/DoReFa-Net/alexnet-dorefa.py              +3    -3
examples/DoReFa-Net/svhn-digit-dorefa.py           +3    -2
examples/DynamicFilterNetwork/steering-filter.py   +3    -2
examples/FasterRCNN/train.py                       +5    -3
examples/GAN/BEGAN.py                              +8    -6
examples/GAN/ConditionalGAN-mnist.py               +8    -16
examples/GAN/CycleGAN.py                           +3    -5
examples/GAN/DCGAN.py                              +4    -4
examples/GAN/DiscoGAN-CelebA.py                    +4    -6
examples/GAN/GAN.py                                +39   -36
examples/GAN/Image2Image.py                        +15   -19
examples/GAN/Improved-WGAN.py                      +4    -4
examples/GAN/InfoGAN-mnist.py                      +9    -15
examples/GAN/WGAN.py                               +7    -5
examples/HED/hed.py                                +4    -2
examples/Inception/inception-bn.py                 +4    -3
examples/Inception/inceptionv3.py                  +2    -2
examples/PennTreebank/PTB-LSTM.py                  +2    -1
examples/ResNet/cifar10-resnet.py                  +3    -2
examples/ResNet/imagenet-resnet.py                 +5    -2
examples/ResNet/load-resnet.py                     +1    -1
examples/ResNet/svhn-resnet.py                     +0    -90
examples/Saliency/CAM-resnet.py                    +9    -9
examples/Saliency/imagenet_resnet_utils.py         +0    -1
examples/Saliency/imagenet_utils.py                +1    -0
examples/Saliency/resnet_model.py                  +1    -0
examples/ShuffleNet/shufflenet.py                  +6    -6
examples/SimilarityLearning/mnist-embeddings.py    +2    -2
examples/SpatialTransformer/mnist-addition.py      +2    -1
examples/boilerplate.py                            +3    -2
examples/cifar-convnet.py                          +6    -6
examples/mnist-convnet.py                          +2    -1
examples/mnist-tfslim.py                           +2    -1
examples/mnist-visualizations.py                   +2    -1
examples/svhn-digit-convnet.py                     +3    -2
examples/tox.ini                                   +1    -1
tensorpack/__init__.py                             +2    -2
tensorpack/callbacks/inference.py                  +1    -1
tensorpack/callbacks/inference_runner.py           +1    -1
tensorpack/callbacks/param.py                      +1    -1
tensorpack/dataflow/common.py                      +2    -2
tensorpack/dataflow/dataset/ilsvrc.py              +1    -1
tensorpack/dataflow/prefetch.py                    +3    -3
tensorpack/graph_builder/model_desc.py             +16   -10
tensorpack/input_source/input_source.py            +8    -4
tensorpack/models/shape_utils.py                   +2    -2
tensorpack/train/base.py                           +156  -195
tensorpack/train/interface.py                      +17   -10
tensorpack/train/tower.py                          +167  -0
tensorpack/train/trainers.py                       +37   -13
tensorpack/trainv1/__init__.py                     +1    -1
tensorpack/trainv1/base.py                         +358  -0
tensorpack/trainv1/config.py                       +15   -15
tensorpack/trainv1/distributed.py                  +1    -1
tensorpack/trainv1/interface.py                    +11   -0
tensorpack/trainv1/multigpu.py                     +6    -6
tensorpack/trainv1/simple.py                       +1    -1
tensorpack/trainv1/utility.py                      +8    -0
tensorpack/user_ops/test-recv-op.py                +1    -1
tensorpack/utils/fs.py                             +1    -1
tensorpack/utils/loadcaffe.py                      +1    -1
tox.ini                                            +3    -3
.github/ISSUE_TEMPLATE.md  (view file @ f0243500)

```diff
 Bug Reports/Feature Requests/Usage Questions Only:

-Bug Reports:
+Bug Reports (including performance bug):
 Some part of code (either the library or examples) doesn't work as expected.
 Always include the following:
 1. What you did. (command you run if using examples; post or describe your code if not)
@@ -13,7 +13,7 @@ Feature Requests:
 2. Add a new feature. Please note that, you can implement a lot of features by extending tensorpack
    (See http://tensorpack.readthedocs.io/en/latest/tutorial/index.html#extend-tensorpack).
    It may not have to be added to tensorpack unless you have a good reason.
-3. Note that we don't implement papers at other's requests.
+3. Note that we don't implement papers at others' requests.

 Usage Questions, e.g.:
 "How do I do [this specific thing] in tensorpack?"
```
CHANGES.md  (view file @ f0243500)

```diff
@@ -8,6 +8,14 @@ so you won't need to look at here very often.
 Here are a list of things that were changed, starting from an early version.
 TensorFlow itself also changed APIs before 1.0 and those are not listed here.

++ [2017/10/21]
+  tensorpack is gradually switching to a new Trainer API.
+  The old API will keep working for a while. To switch to new API, the easiest way is to:
+  1. `export TENSORPACK_TRAIN_API=v2` (will be default soon in the future).
+  2. Replace `SomeTrainer(config, ...).train()` with `launch_train_with_config(config, SomeTrainer(...))`.
+
 + [2017/10/18]
   `TrainConfig(predict_tower)` was deprecated. You can set the inference device directly when creating the
   `InferenceRunner` callback.
 + [2017/10/12](https://github.com/ppwwyyxx/tensorpack/commit/7e963996f615b85f7459455596b4ee9bbd0bce8e).
```
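To make the two-step migration recipe above concrete, here is a minimal, self-contained sketch of both steps together. The toy model and dataflow are invented purely for illustration; only `TENSORPACK_TRAIN_API`, `ModelDesc`, `InputDesc`, `TrainConfig`, `DataFromList`, `SimpleTrainer` and `launch_train_with_config` are real tensorpack names touched by this changeset:

```python
import os
os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # step 1: opt in to the new API

import numpy as np
import tensorflow as tf
from tensorpack import (ModelDesc, InputDesc, TrainConfig,
                        SimpleTrainer, launch_train_with_config)
from tensorpack.dataflow import DataFromList


class ToyModel(ModelDesc):
    """A one-variable least-squares model, just to keep the sketch runnable."""
    def _get_inputs(self):
        return [InputDesc(tf.float32, (None,), 'x')]

    def _build_graph(self, inputs):
        x, = inputs
        w = tf.get_variable('w', [], initializer=tf.constant_initializer(0.))
        self.cost = tf.reduce_mean(tf.square(x - w), name='cost')

    def _get_optimizer(self):
        return tf.train.GradientDescentOptimizer(0.1)


config = TrainConfig(
    model=ToyModel(),
    # each datapoint is one length-1 float vector, matching the InputDesc above
    dataflow=DataFromList([[np.array([i], dtype='float32')] for i in range(64)]),
    steps_per_epoch=10, max_epoch=1)

# step 2 -- old API:  SimpleTrainer(config).train()
#           new API:
launch_train_with_config(config, SimpleTrainer())
```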
docs/conf.py  (view file @ f0243500)

```diff
@@ -367,6 +367,7 @@ def autodoc_skip_member(app, what, name, obj, skip, options):
             'VisualQA',
             'huber_loss',
             'DumpTensor',
+            'StagingInputWrapper',
             'StepTensorPrinter']:
         return True
```
docs/tutorial/callback.md  (view file @ f0243500)

````diff
@@ -25,9 +25,7 @@ Therefore these features can be reused with one single line, as long as you are
 For example, these are the callbacks I used when training a ResNet:

 ```python
-TrainConfig(
-    # ...
-    callbacks=[
+callbacks=[
     # save the model every epoch
     ModelSaver(),
     # backup the model with best validation error
@@ -39,7 +37,7 @@ TrainConfig(
     # schedule the learning rate based on epoch number
     ScheduledHyperParamSetter('learning_rate',
                               [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5)]),
-    # can manually set the learning rate during training
+    # can manually change the learning rate through a file during training
     HumanHyperParamSetter('learning_rate'),
     # send validation error to my phone through pushbullet
     SendStat('curl -u your_id_xxx: https://api.pushbullet.com/v2/pushes \\
@@ -50,8 +48,7 @@ TrainConfig(
     GPUUtilizationTracker(),
     # can pause the training and start a debug shell, to observe what's going on
     InjectShell(shell='ipython')
-    ],
-    extra_callbacks=[    # these callbacks are enabled by default already
+] + [
+    # these callbacks are enabled by default already, though you can customize them
     # maintain those moving average summaries already defined in the model (e.g. training loss, training error)
     MovingAverageSummary(),
     # draw a nice progress bar
@@ -60,23 +57,22 @@ TrainConfig(
     MergeAllSummaries(),
     # run ops in GraphKeys.UPDATE_OPS collection along with training, if any
     RunUpdateOps(),
-    ],
-    monitors=[    # monitors are a special kind of callbacks. these are also enabled by default
+] + [
+    # monitors are a special kind of callbacks. these are also enabled by default
     # write everything to tensorboard
     TFEventWriter(),
     # write all scalar data to a json file, for easy parsing
     JSONWriter(),
     # print all scalar data every epoch (can be configured differently)
     ScalarPrinter(),
-    ]
-)
+]
 ```

 Notice that callbacks cover every detail of training, ranging from graph operations to the progress bar.
 This means you can customize every part of the training to your preference, e.g. display something
 different in the progress bar, evaluating part of the summaries at a different frequency, etc.

 These features may not be always useful, but think about how messy the main loop would look like if you
-were to write the logic together with the loops, and how easy your life will be if you could enable
+were to write these logic together with the loops, and how easy your life will be if you could enable
 these features with one line when you need them.

 See [Write a callback](http://tensorpack.readthedocs.io/en/latest/tutorial/extend/callback.html)
````
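For readers following along, a rough sketch of what the linked "Write a callback" tutorial covers, namely the shape of a `Callback` subclass. The overridable method names come from the tensorpack `Callback` base class; the toy body is invented for illustration:

```python
from tensorpack import Callback


class PrintGlobalStep(Callback):
    """Toy callback: report the global step once per epoch."""

    def _setup_graph(self):
        # runs while the graph is being finalized; add extra ops/tensors
        # here if the callback needs them
        pass

    def _trigger_epoch(self):
        # `self.trainer` is populated by the framework before training starts
        print('finished an epoch at global_step={}'.format(self.trainer.global_step))
```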
docs/tutorial/extend/trainer.md  (view file @ f0243500)

```diff
 ## Write a Trainer

 **These contents are subject to change in later versions soon**.

-The existing trainers should be enough for single-cost optimization tasks.
-If you want to do something different during training, first consider writing it as a callback,
+The existing trainers should be enough for single-tower single-cost optimization tasks.
+If you just want to do some extra work during training, first consider writing it as a callback,
 or write an issue to see if there is a better solution than creating new trainers.

-If your task is fundamentally different from single-cost optimization, you may need to write a trainer.
-Trainers are recently being redesigned, so the best way to customize the trainer will likely change.
-We leave the tutorial empty for now.
-
-<!--
-Trainers just run __some__ iterations, so there is no limit in where the data come from or what to do in an iteration.
-The existing common trainers all implement two things:
-1. Setup the graph and input pipeline, using the given `TrainConfig`.
-2. Minimize `model.cost` in each iteration.
-
-But you can customize it by using the base `Trainer` class.
-
-* To customize the graph:
-
-  Add any tensors and ops you like, either before creating the trainer or inside `Trainer.__init__`.
-  In this case you don't need to set model/data in `TrainConfig` any more.
-
-* Two ways to customize the iteration:
-
-  1. Set `Trainer.train_op`. This op will be run by default.
-  2. Subclass `Trainer` and override the `run_step()` method. This way you can do something more than running an op.
-
-There are several different [GAN trainers](../../examples/GAN/GAN.py) for reference.
-The implementation of [SimpleTrainer](../../tensorpack/train/simple.py) may also be helpful.
--->
+If your task is fundamentally different from single-cost optimization, you will need to write a trainer.
+Trainers just run __some__ iterations, so there is no limit in where the data come from or what to do in an iteration.
+The existing common trainers all implement two things:
+1. Setup the graph and input pipeline, using the given `InputSource` and `get_cost_fn`.
+2. Minimize `model.cost` in each iteration.
+
+But you can customize it by using or inheriting the base `Trainer` class.
+You will need to define two things for a new Trainer:
+
+1. What is the graph.
+   Add any tensors and ops you like, either before creating the trainer or inside `Trainer.__init__`.
+
+2. What is the iteration. There are 2 ways to define an iteration:
+   1. Set `Trainer.train_op`. This op will be run by default.
+   2. Subclass `Trainer` and override the `run_step()` method. This way you can do something more than running an op.
+
+There are several different [GAN trainers](../../examples/GAN/GAN.py) for reference.
```
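A minimal sketch of the two customization options just described, under the new API. The loss tensor is a stand-in the caller must provide, and the optimizer choice is arbitrary; the real reference implementations remain the GAN trainers linked above:

```python
import tensorflow as tf
from tensorpack import Trainer


class MyCustomTrainer(Trainer):
    def __init__(self, loss):
        super(MyCustomTrainer, self).__init__()
        # option 1: define the iteration by setting train_op;
        # the base class then runs this op in every step
        opt = tf.train.GradientDescentOptimizer(0.01)
        self.train_op = opt.minimize(loss, name='min_op')

    # option 2: instead of (or in addition to) train_op, override run_step()
    # to do more than running a single op, e.g. alternate between several ops:
    # def run_step(self):
    #     self.hooked_sess.run(self.train_op)
```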
docs/tutorial/graph.md  (deleted, 100644 → 0; view file @ d38d22bf)

The whole tutorial page was removed. Its former content:

# Build the Graph

This tutorial explains how a graph is built in tensorpack.

### ModelDesc

`ModelDesc` is an abstraction over the most common type of models people train.
It assumes:

1. Training is a single-cost optimized by a single `tf.train.Optimizer`.
2. The graph can be trivially duplicated for data-parallel training or inference.

If your task is single-cost optimization,
you can subclass `ModelDesc` and implement several methods:

```python
class MyModel(ModelDesc):
    def _get_inputs(self):
        return [InputDesc(...), InputDesc(...)]

    def _build_graph(self, inputs):
        tensorA, tensorB = inputs
        # build the graph
        self.cost = xxx  # define the cost tensor

    def _get_optimizer(self):
        return tf.train.GradientDescentOptimizer(0.1)
```

`_get_inputs` should define the metainfo of all the inputs your graph may need.
`_build_graph` should add tensors/operations to the graph, where
the argument `inputs` is the list of input tensors matching `_get_inputs`.

You can use any symbolic functions in `_build_graph`, including TensorFlow core library
functions and other symbolic libraries.

### How it is Used:

Most tensorpack trainers expect a `ModelDesc`, and use it as a __description
of the TF graph to be built__.
These trainers will use `_get_inputs` to connect the given `InputSource` to the graph.
They'll then use `_build_graph` to create the backbone model, and then `_get_optimizer`
to create the minimization op, and run it.

Note that data-parallel multi-GPU trainers will call `_build_graph` __multiple times__ on each GPU.
A trainer may also make __extra calls__ to `_build_graph` for inference, if used by some callbacks.

`_build_graph` will always be called under some `TowerContext` which contains these context information
(e.g. training or inference, reuse or not, scope name) for your access.

Also, to respect variable reuse among multiple calls, use `tf.get_variable()` instead of `tf.Variable`
in `_build_graph`, if you need to create any variables.

### Build It Manually

When you need to deal with complicated graph, it may be easier to build the graph manually.
You are free to do so as long as you tell the trainer what to do in each step.
Check out [Write a Trainer](extend/trainer.html) for using a custom graph with trainer.
docs/tutorial/index.rst  (view file @ f0243500)

```diff
@@ -39,9 +39,9 @@ User Tutorials
   dataflow
   input-source
   efficient-dataflow
-  graph
   symbolic
   trainer
+  training-interface
   callback
   summary
   faq
```
docs/tutorial/trainer.md  (view file @ f0243500)

The rewritten opening of the file:

# Trainer

Tensorpack follows the "define-and-run" paradigm. A training has two steps:

1. Build graph for the model.
   Users can call whatever tensorflow functions to setup the graph.
   Users may or may not use tensorpack `InputSource`, `ModelDesc` to build the graph.
   This step defines "what to run" in every training step.
   It can happen either inside or outside the trainer.

2. Train the model (the [Trainer.train() method](http://tensorpack.readthedocs.io/en/latest/modules/train.html#tensorpack.train.Trainer.train)):

   1. Setup callbacks/monitors.
   2. Finalize the graph, initialize session.
   3. Run the main loop.

## Assumptions of Base Trainer

In research we do training of various kind.
Tensorpack trainers try to avoid making assumptions on what type of training
you want to do (e.g., it doesn't have to be batched, SGD-like, or have `X` (inputs) and `y` (outputs)).
The only assumption the tensorpack `Trainer` class makes about your training is that it
follows this pattern:

```python
...
```

````diff
@@ -15,50 +34,36 @@ Tensorpack base trainer implements the logic of __running the iteration__.
 Users or derived trainers should implement __what the iteration is__.

 2. Trainer assumes the existence of __"epoch"__, i.e. that the iterations run in double for-loops.
-   But an epoch doesn't need to be a full pass of your dataset, the size of an epoch can be any number you set
+   But the epoch size can actually be any number you set
    and it only affects the [schedule of callbacks](extend/callback.html).
    In other words, an "epoch" in tensorpack is the __default period to run callbacks__ (validation, summary, checkpoint, etc.).

-### Common Trainers
+### Single-Cost Trainers

 Most neural network training tasks are single-cost optimization.
 Tensorpack provides some trainer implementations for such tasks.
 These trainers will build the graph based on the given `ModelDesc`, and minimizes `ModelDesc.cost`.

-<!--
-To use trainers, pass a `TrainConfig` to configure them:
-
-```python
-config = TrainConfig(
-           model=MyModel()
-           dataflow=my_dataflow,
-           # data=my_inputsource, # alternatively, use a customized InputSource
-           callbacks=[...]
-       )
-
-# start training:
-SomeTrainer(config, other_arguments).train()
-
-# start multi-GPU training with synchronous update:
-# SyncMultiGPUTrainerParameterServer(config).train()
-```
-
-When you set the DataFlow (rather than the InputSource) in the config,
-tensorpack trainers automatically adopt certain prefetch mechanism, as mentioned
-in the [Input Pipeline](input-source.html) tutorial.
-You can set the InputSource instead, to customize this behavior.
--->
-Trainers are being redesigned, so the recommended API will likely be changed soon.
+These trainers will build the graph by itself, with the following arguments:
+
+1. Some `InputDesc`, the metadata about the input.
+2. An `InputSource`, where the input come from. See [Input Pipeline](input-source.html).
+3. A function which takes input tensors and returns the cost.
+4. A function which returns an optimizer.
+
+See [SingleCostTrainer.setup_graph](http://tensorpack.readthedocs.io/en/latest/modules/train.html#tensorpack.train.SingleCostTrainer.setup_graph) for details.

 Existing multi-GPU trainers include the logic of data-parallel training.
 You can enable them by just one line, and all the necessary logic to achieve the best performance was baked into the trainers already.
 The trainers can reach the same performance as the [official tensorflow benchmark](https://www.tensorflow.org/performance/benchmarks).

 Please note that in data-parallel training, in each iteration all towers (all replicates of the model) will take
-tensors from the InputSource (instead of taking one for all and split). So the total batch size
+tensors from the `InputSource` (instead of taking one for all and split). So the total batch size
 would be ``(batch size of InputSource/DataFlow) * #GPU``.

+There are also high-level wrappers that have a slightly simpler interface (but exist mainly for old users).
+See [High-Level Training Interface](training-interface.html).

 ### Custom Trainers

 You can easily write a trainer for other types of training.
````
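To make the four-argument interface above concrete, a rough sketch of driving a single-cost trainer directly. `inputs_desc`, `my_dataflow`, `get_cost_fn` and `get_opt_fn` are placeholders the user must define; `setup_graph` and `train_with_defaults` are the methods introduced in this changeset:

```python
from tensorpack import QueueInput, ModelSaver, SyncMultiGPUTrainerParameterServer

trainer = SyncMultiGPUTrainerParameterServer(2)   # a stock single-cost trainer
trainer.setup_graph(
    inputs_desc,               # 1. a list of InputDesc
    QueueInput(my_dataflow),   # 2. an InputSource wrapping the data
    get_cost_fn,               # 3. takes input tensors, returns the cost tensor
    get_opt_fn)                # 4. returns a tf.train.Optimizer
trainer.train_with_defaults(
    callbacks=[ModelSaver()],
    steps_per_epoch=100,
    max_epoch=10)
```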
docs/tutorial/training-interface.md  (new file, 0 → 100644; view file @ f0243500)

# Training Interface

Tensorpack trainers have an interface for maximum flexibility.
There are also interfaces built on top of trainers to simplify the use,
when you don't want to customize too much.

### Raw Trainer Interface

For a general trainer, build the graph by yourself.
For a single-cost trainer, build the graph by
[SingleCostTrainer.setup_graph](http://tensorpack.readthedocs.io/en/latest/modules/train.html#tensorpack.train.SingleCostTrainer.setup_graph).

Then, call
[Trainer.train()](http://tensorpack.readthedocs.io/en/latest/modules/train.html#tensorpack.train.Trainer.train),
or
[Trainer.train_with_defaults()](http://tensorpack.readthedocs.io/en/latest/modules/train.html#tensorpack.train.Trainer.train_with_defaults),
which applies some default options for normal use cases.

### With ModelDesc and TrainConfig

[SingleCost trainers](trainer.html#single-cost-trainers) expect 4 arguments in `setup_graph`:
`InputDesc`, `InputSource`, a get_cost function, and an optimizer.
`ModelDesc` describes a model by packing 3 of them together into one object:

```python
class MyModel(ModelDesc):
    def _get_inputs(self):
        return [InputDesc(...), InputDesc(...)]

    def _build_graph(self, inputs):
        tensorA, tensorB = inputs
        # build the graph
        self.cost = xxx  # define the cost tensor

    def _get_optimizer(self):
        return tf.train.GradientDescentOptimizer(0.1)
```

`_get_inputs` should define the metainfo of all the inputs your graph will take to build.

`_build_graph` takes a list of `inputs` tensors which will match `_get_inputs`.
You can use any symbolic functions in `_build_graph`, including TensorFlow core library
functions and other symbolic libraries.
But you need to follow the requirement of
[get_cost_fn](http://tensorpack.readthedocs.io/en/latest/modules/train.html#tensorpack.train.SingleCostTrainer.setup_graph),
because this function will be used as part of `get_cost_fn`.
At last you need to set `self.cost`.

After defining such a model, use it with `TrainConfig` and `launch_train_with_config`:

```python
config = TrainConfig(
    model=MyModel(),
    dataflow=my_dataflow,
    # data=my_inputsource, # alternatively, use a customized InputSource
    callbacks=[...]
)
trainer = SomeTrainer()
# trainer = SyncMultiGPUTrainerParameterServer([0, 1, 2])
launch_train_with_config(config, trainer)
```

See the docs of
[launch_train_with_config](http://tensorpack.readthedocs.io/en/latest/modules/train.html#tensorpack.train.launch_train_with_config)
for its usage and detailed functionalities.
examples/A3C-Gym/train-atari.py  (view file @ f0243500)

```diff
@@ -18,6 +18,7 @@ import tensorflow as tf
 import six
 from six.moves import queue

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.utils.concurrency import *
 from tensorpack.utils.serialize import *
@@ -303,5 +304,5 @@ if __name__ == '__main__':
         config = get_config()
         if args.load:
             config.session_init = get_model_loader(args.load)
-        trainer = QueueInputTrainer if config.nr_tower == 1 else AsyncMultiGPUTrainer
-        trainer(config).train()
+        trainer = SimpleTrainer() if config.nr_tower == 1 else AsyncMultiGPUTrainer(config.tower)
+        launch_train_with_config(config, trainer)
```
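The remaining example scripts below follow the same mechanical migration, so it is worth stating the pattern once (a sketch; `SomeTrainer` stands for whichever trainer each script actually uses):

```python
import os
os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # must be set before importing tensorpack
from tensorpack import *

# ... build `config` exactly as before, then replace
#         SomeTrainer(config).train()
# with
#         launch_train_with_config(config, SomeTrainer())
```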
examples/CTC-TIMIT/train-timit.py  (view file @ f0243500)

```diff
@@ -12,6 +12,7 @@ import operator
 import six
 from six.moves import map, range

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.gradproc import SummaryGradient, GlobalNormClip
 from tensorpack.utils.globvars import globalns as param
@@ -94,7 +95,7 @@ def get_data(path, isTrain, stat_file):
 def get_config(ds_train, ds_test):
     return TrainConfig(
-        dataflow=ds_train,
+        data=QueueInput(ds_train),
         callbacks=[
             ModelSaver(),
             StatMonitorParamSetter('learning_rate', 'error',
@@ -128,4 +129,4 @@ if __name__ == '__main__':
     config = get_config(ds_train, ds_test)
     if args.load:
         config.session_init = SaverRestore(args.load)
-    QueueInputTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
```
examples/Char-RNN/char-rnn.py  (view file @ f0243500)

```diff
@@ -12,6 +12,7 @@ import operator
 import six
 from six.moves import map, range

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils import symbolic_functions, summary, optimizer
 from tensorpack.tfutils.gradproc import GlobalNormClip
@@ -116,7 +117,7 @@ def get_config():
     ds = BatchData(ds, param.batch_size)
     return TrainConfig(
-        dataflow=ds,
+        data=QueueInput(ds),
         callbacks=[
             ModelSaver(),
             ScheduledHyperParamSetter('learning_rate', [(25, 2e-4)])
@@ -190,4 +191,4 @@ if __name__ == '__main__':
         config = get_config()
         if args.load:
             config.session_init = SaverRestore(args.load)
-        QueueInputTrainer(config).train()
+        launch_train_with_config(config, SimpleTrainer())
```
examples/DeepQNetwork/DQN.py  (view file @ f0243500)

```diff
@@ -16,6 +16,7 @@ import multiprocessing
 import threading
 from collections import deque

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.utils.concurrency import *
 import tensorflow as tf
@@ -105,7 +106,7 @@ def get_config():
     )
     return TrainConfig(
-        dataflow=expreplay,
+        data=QueueInput(expreplay),
         model=Model(),
         callbacks=[
             ModelSaver(),
@@ -166,4 +167,4 @@ if __name__ == '__main__':
         config = get_config()
         if args.load:
             config.session_init = get_model_loader(args.load)
-        QueueInputTrainer(config).train()
+        launch_train_with_config(config, SimpleTrainer())
```
examples/DeepQNetwork/common.py  (view file @ f0243500)

The two hunks shown restructure the evaluation loop in `eval_with_funcs`; the code at those locations now reads:

```python
# @@ -79,7 +79,7 @@ def eval_with_funcs(predictors, nr_eval, get_player_fn):
        k.start()
        time.sleep(0.1)  # avoid simulator bugs
    stat = StatCounter()
    try:
        for _ in tqdm(range(nr_eval), **get_tqdm_kwargs()):
            r = q.get()
            stat.feed(r)

# @@ -91,9 +91,7 @@ def eval_with_funcs(predictors, nr_eval, get_player_fn):
        while q.qsize():
            r = q.get()
            stat.feed(r)
    except:
        logger.exception("Eval")
    finally:
        if stat.count > 0:
            return (stat.average, stat.max)
        return (0, 0)
```
examples/DeepQNetwork/expreplay.py  (view file @ f0243500)

```diff
@@ -258,7 +258,7 @@ class ExpReplay(DataFlow, Callback):
             mean, max = v.average, v.max
             self.trainer.monitors.put_scalar('expreplay/mean_score', mean)
             self.trainer.monitors.put_scalar('expreplay/max_score', max)
-        except:
+        except Exception:
             logger.exception("Cannot log training scores.")
         v.reset()
```
examples/DisturbLabel/mnist-disturb.py  (view file @ f0243500)

```diff
@@ -8,6 +8,7 @@ import os
 import sys
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.dataflow import dataset
 import tensorflow as tf
@@ -65,4 +66,4 @@ if __name__ == '__main__':
         os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
     config = get_config()
-    QueueInputTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
```
examples/DisturbLabel/svhn-disturb.py  (view file @ f0243500)

```diff
@@ -8,6 +8,7 @@ import numpy as np
 import os
 import imp

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.dataflow import dataset
@@ -56,4 +57,4 @@ if __name__ == '__main__':
         os.environ['CUDA_VISIBLE_DEVICES'] = '0'
     config = get_config(args.prob)
-    QueueInputTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
```
examples/DoReFa-Net/alexnet-dorefa.py  (view file @ f0243500)

```diff
@@ -11,6 +11,7 @@ import multiprocessing
 import os
 import sys

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.symbolic_functions import *
 from tensorpack.tfutils.summary import *
@@ -226,7 +227,7 @@ def get_data(dataset_name):
     ds = AugmentImageComponent(ds, augmentors, copy=False)
     ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
     if isTrain:
-        ds = PrefetchDataZMQ(ds, min(12, multiprocessing.cpu_count()))
+        ds = PrefetchDataZMQ(ds, min(25, multiprocessing.cpu_count()))
     return ds
@@ -321,5 +322,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = SaverRestore(args.load)
-    config.nr_tower = nr_tower
-    SyncMultiGPUTrainer(config).train()
+    launch_train_with_config(config, SyncMultiGPUTrainer(nr_tower))
```
examples/DoReFa-Net/svhn-digit-dorefa.py  (view file @ f0243500)

```diff
@@ -7,6 +7,7 @@ import argparse
 import numpy as np
 import os

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.symbolic_functions import *
 from tensorpack.tfutils.summary import *
@@ -163,7 +164,7 @@ def get_config():
     data_test = BatchData(data_test, 128, remainder=True)
     return TrainConfig(
-        dataflow=data_train,
+        data=QueueInput(data_train),
         callbacks=[
             ModelSaver(),
             InferenceRunner(data_test,
@@ -183,4 +184,4 @@ if __name__ == '__main__':
     BITW, BITA, BITG = map(int, args.dorefa.split(','))
     config = get_config()
-    QueueInputTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
```
examples/DynamicFilterNetwork/steering-filter.py  (view file @ f0243500)

```diff
@@ -6,10 +6,12 @@ import argparse
 import numpy as np
 import tensorflow as tf
 import cv2
+import os
 from scipy.signal import convolve2d
 from six.moves import range, zip
 import multiprocessing

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.utils import logger
 from tensorpack.utils.viz import *
@@ -262,5 +264,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = SaverRestore(args.load)
-    config.nr_tower = NR_GPU
-    SyncMultiGPUTrainer(config).train()
+    launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
```
examples/FasterRCNN/train.py  (view file @ f0243500)

```diff
@@ -13,6 +13,7 @@ import numpy as np
 import json
 import tensorflow as tf

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 import tensorpack.tfutils.symbolic_functions as symbf
 from tensorpack.tfutils.summary import add_moving_summary
@@ -222,12 +223,13 @@ class EvalCallback(Callback):
     def _setup_graph(self):
         self.pred = self.trainer.get_predictor(['image'], ['fastrcnn_fg_probs', 'fastrcnn_fg_boxes'])
         self.df = PrefetchDataZMQ(get_eval_dataflow(), 1)
-        get_tf_nms()   # just to make sure the nms part of graph is created

     def _before_train(self):
         EVAL_TIMES = 5  # eval 5 times during training
         interval = self.trainer.max_epoch // (EVAL_TIMES + 1)
         self.epochs_to_eval = set([interval * k for k in range(1, EVAL_TIMES)])
         self.epochs_to_eval.add(self.trainer.max_epoch)
+        get_tf_nms()   # just to make sure the nms part of graph is created

     def _eval(self):
         all_results = eval_on_dataflow(self.df, lambda img: detect_one_image(img, self.pred))
@@ -300,6 +302,6 @@ if __name__ == '__main__':
         steps_per_epoch=stepnum,
         max_epoch=230000 * factor // stepnum,
         session_init=get_model_loader(args.load) if args.load else None,
-        nr_tower=get_nr_gpu()
     )
-    SyncMultiGPUTrainerReplicated(cfg, gpu_prefetch=False).train()
+    trainer = SyncMultiGPUTrainerReplicated(get_nr_gpu())
+    launch_train_with_config(cfg, trainer)
```
examples/GAN/BEGAN.py  (view file @ f0243500)

```diff
@@ -6,6 +6,7 @@
 import os
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.summary import add_moving_summary
 from tensorpack.utils.gpu import get_nr_gpu
@@ -145,8 +146,6 @@ if __name__ == '__main__':
     logger.auto_set_dir()

     config = TrainConfig(
-        model=Model(),
-        dataflow=DCGAN.get_data(args.data),
         callbacks=[
             ModelSaver(),
             StatMonitorParamSetter(
@@ -155,9 +154,12 @@ if __name__ == '__main__':
         steps_per_epoch=500,
         max_epoch=400,
         session_init=SaverRestore(args.load) if args.load else None,
-        nr_tower=max(get_nr_gpu(), 1)
     )

-    if config.nr_tower == 1:
-        GANTrainer(config).train()
+    input = QueueInput(DCGAN.get_data(args.data))
+    model = Model()
+    nr_tower = max(get_nr_gpu(), 1)
+    if nr_tower == 1:
+        trainer = GANTrainer(input, model)
     else:
-        MultiGPUGANTrainer(config).train()
+        trainer = MultiGPUGANTrainer(nr_tower, input, model)
+    trainer.train_with_config(config)
```
examples/GAN/ConditionalGAN-mnist.py  (view file @ f0243500)

```diff
@@ -10,6 +10,7 @@ import sys
 import cv2
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.utils.viz import *
 import tensorpack.tfutils.symbolic_functions as symbf
@@ -104,18 +105,6 @@ def get_data():
     return BatchData(ds, BATCH)

-def get_config():
-    logger.auto_set_dir()
-    dataset = get_data()
-    return TrainConfig(
-        dataflow=dataset,
-        callbacks=[ModelSaver()],
-        model=Model(),
-        steps_per_epoch=500,
-        max_epoch=100,
-    )

 def sample(model_path):
     pred = PredictConfig(
         session_init=get_model_loader(model_path),
@@ -144,7 +133,10 @@ if __name__ == '__main__':
     if args.sample:
         sample(args.load)
     else:
-        config = get_config()
-        if args.load:
-            config.session_init = SaverRestore(args.load)
-        GANTrainer(config).train()
+        logger.auto_set_dir()
+        GANTrainer(QueueInput(get_data()), Model()).train_with_defaults(
+            callbacks=[ModelSaver()],
+            steps_per_epoch=500,
+            max_epoch=100,
+            session_init=SaverRestore(args.load) if args.load else None
+        )
```
examples/GAN/CycleGAN.py  (view file @ f0243500)

```diff
@@ -9,6 +9,7 @@ import glob
 from six.moves import map, zip, range
 import numpy as np

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.utils.viz import *
 import tensorpack.tfutils.symbolic_functions as symbf
@@ -217,9 +218,7 @@ if __name__ == '__main__':
     data = get_data(args.data)
     data = PrintData(data)

-    config = TrainConfig(
-        model=Model(),
-        dataflow=data,
+    GANTrainer(QueueInput(data), Model()).train_with_defaults(
         callbacks=[
             ModelSaver(),
             ScheduledHyperParamSetter(
@@ -228,7 +227,6 @@ if __name__ == '__main__':
             PeriodicTrigger(VisualizeTestSet(), every_k_epochs=3),
         ],
         max_epoch=195,
         steps_per_epoch=data.size(),
         session_init=SaverRestore(args.load) if args.load else None
     )
-    GANTrainer(config).train()
```
examples/GAN/DCGAN.py  (view file @ f0243500)

```diff
@@ -8,6 +8,7 @@ import numpy as np
 import os, sys
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.utils.viz import *
 from tensorpack.tfutils.summary import add_moving_summary
@@ -155,12 +156,11 @@ if __name__ == '__main__':
     else:
         assert args.data
         logger.auto_set_dir()
-        config = TrainConfig(
-            model=Model(),
-            dataflow=get_data(args.data),
+        GANTrainer(
+            input=QueueInput(get_data(args.data)),
+            model=Model()).train_with_defaults(
             callbacks=[ModelSaver()],
             steps_per_epoch=300,
             max_epoch=200,
-        )
-        GANTrainer(config).train()
+            session_init=SaverRestore(args.load) if args.load else None
+        )
```
examples/GAN/DiscoGAN-CelebA.py  (view file @ f0243500)

```diff
@@ -8,6 +8,7 @@ import argparse
 from six.moves import map, zip
 import numpy as np

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.utils.viz import *
 import tensorpack.tfutils.symbolic_functions as symbf
@@ -216,14 +217,11 @@ if __name__ == '__main__':
     data = get_celebA_data(args.data, args.style_A, args.style_B)

-    config = TrainConfig(
-        model=Model(),
-        dataflow=data,
+    # train 1 D after 2 G
+    SeparateGANTrainer(QueueInput(data), Model(), d_period=3).train_with_defaults(
         callbacks=[ModelSaver()],
         steps_per_epoch=300,
         max_epoch=250,
         session_init=SaverRestore(args.load) if args.load else None
     )
-    # train 1 D after 2 G
-    SeparateGANTrainer(config, d_period=3).train()
```
examples/GAN/GAN.py  (view file @ f0243500)

```diff
@@ -6,9 +6,9 @@
 import tensorflow as tf
 import numpy as np
 import time
-from tensorpack import (Trainer, QueueInput,
-                        ModelDescBase, DataFlow, StagingInputWrapper,
-                        TowerContext)
+from tensorpack import (TowerTrainer, QueueInput,
+                        ModelDescBase, DataFlow, StagingInput,
+                        TowerContext, TowerFuncWrapper)
 from tensorpack.graph_builder import DataParallelBuilder, LeastLoadedDeviceSetter
 from tensorpack.tfutils.summary import add_moving_summary
 from tensorpack.utils.argtools import memoized
@@ -64,20 +64,17 @@ class GANModelDesc(ModelDescBase):
         return self._get_optimizer()


-class GANTrainer(Trainer):
-    def __init__(self, config):
-        """
-        GANTrainer expects a ModelDesc in config which sets the following attribute
-        after :meth:`_build_graph`: g_loss, d_loss, g_vars, d_vars.
-        """
-        input = QueueInput(config.dataflow)
-        model = config.model
-        cbs = input.setup(model.get_inputs_desc())
-        config.callbacks.extend(cbs)
+class GANTrainer(TowerTrainer):
+    def __init__(self, input, model):
+        super(GANTrainer, self).__init__()
+        assert isinstance(model, GANModelDesc), model
+        inputs_desc = model.get_inputs_desc()
+        cbs = input.setup(inputs_desc)

+        tower_func = TowerFuncWrapper(model.build_graph, inputs_desc)
         with TowerContext('', is_training=True):
-            model.build_graph(input)
+            tower_func(*input.get_input_tensors())
         opt = model.get_optimizer()

         # by default, run one d_min after one g_min
@@ -86,29 +83,29 @@ class GANTrainer(TowerTrainer):
         with tf.control_dependencies([g_min]):
             d_min = opt.minimize(model.d_loss, var_list=model.d_vars, name='d_op')
         self.train_op = d_min
-        super(GANTrainer, self).__init__(config)
+        self.set_tower_func(tower_func)
+        for cb in cbs:
+            self._register_callback(cb)


-class SeparateGANTrainer(Trainer):
-    """ A GAN trainer which runs two optimization ops with a certain ratio, one in each step. """
-    def __init__(self, config, d_period=1, g_period=1):
+class SeparateGANTrainer(TowerTrainer):
+    """ A GAN trainer which runs two optimization ops with a certain ratio. """
+    def __init__(self, input, model, d_period=1, g_period=1):
         """
         Args:
             d_period(int): period of each d_opt run
             g_period(int): period of each g_opt run
         """
+        super(SeparateGANTrainer, self).__init__()
         self._d_period = int(d_period)
         self._g_period = int(g_period)
         assert min(d_period, g_period) == 1

-        input = QueueInput(config.dataflow)
-        model = config.model
         cbs = input.setup(model.get_inputs_desc())
-        config.callbacks.extend(cbs)
+        tower_func = TowerFuncWrapper(model.build_graph, model.get_inputs_desc())
         with TowerContext('', is_training=True):
-            model.build_graph(input)
+            tower_func(*input.get_input_tensors())

         opt = model.get_optimizer()
         with tf.name_scope('optimize'):
@@ -117,7 +114,9 @@ class SeparateGANTrainer(TowerTrainer):
             self.g_min = opt.minimize(model.g_loss, var_list=model.g_vars, name='g_min')
-        super(SeparateGANTrainer, self).__init__(config)
+        self.set_tower_func(tower_func)
+        for cb in cbs:
+            self._register_callback(cb)

     def run_step(self):
         if self.global_step % (self._d_period) == 0:
@@ -126,26 +125,28 @@ class SeparateGANTrainer(TowerTrainer):
             self.hooked_sess.run(self.g_min)


-class MultiGPUGANTrainer(Trainer):
+class MultiGPUGANTrainer(TowerTrainer):
     """
     A replacement of GANTrainer (optimize d and g one by one) with multi-gpu support.
     """
-    def __init__(self, config):
-        nr_gpu = config.nr_tower
+    def __init__(self, nr_gpu, input, model):
+        super(MultiGPUGANTrainer, self).__init__()
         assert nr_gpu > 1
-        raw_devices = ['/gpu:{}'.format(k) for k in config.tower]
+        raw_devices = ['/gpu:{}'.format(k) for k in range(nr_gpu)]

         # setup input
-        input = StagingInputWrapper(QueueInput(config.dataflow), config.tower)
-        model = config.model
+        input = StagingInput(input, list(range(nr_gpu)))
         cbs = input.setup(model.get_inputs_desc())
-        config.callbacks.extend(cbs)

-        def get_cost():
-            model.build_graph(input)
+        def get_cost(*inputs):
+            model.build_graph(inputs)
             return [model.d_loss, model.g_loss]
+        tower_func = TowerFuncWrapper(get_cost, model.get_inputs_desc())
         devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
-        cost_list = DataParallelBuilder.build_on_towers(config.tower, get_cost, devices)
+        cost_list = DataParallelBuilder.build_on_towers(
+            list(range(nr_gpu)),
+            lambda: tower_func(*input.get_input_tensors()),
+            devices)
         # simply average the cost. It might get faster to average the gradients
         with tf.name_scope('optimize'):
             d_loss = tf.add_n([x[0] for x in cost_list]) * (1.0 / nr_gpu)
@@ -159,7 +160,9 @@ class MultiGPUGANTrainer(TowerTrainer):
             d_min = opt.minimize(d_loss, var_list=model.d_vars,
                                  colocate_gradients_with_ops=True, name='d_op')
         self.train_op = d_min
-        super(MultiGPUGANTrainer, self).__init__(config)
+        self.set_tower_func(tower_func)
+        for cb in cbs:
+            self._register_callback(cb)


 class RandomZData(DataFlow):
```
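The net effect of this rewrite is that the GAN trainers are now constructed from an `InputSource` and a model instead of a `TrainConfig`, matching the updated example scripts elsewhere in this commit. A condensed usage sketch, mirroring the new `DCGAN.py` above; `my_dataflow` and `MyGANModel` are placeholders for a user-defined dataflow and `GANModelDesc` subclass:

```python
GANTrainer(
    input=QueueInput(my_dataflow),
    model=MyGANModel()).train_with_defaults(
    callbacks=[ModelSaver()],
    steps_per_epoch=300,
    max_epoch=200)
```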
examples/GAN/Image2Image.py  (view file @ f0243500)

```diff
@@ -12,6 +12,7 @@ import os
 import sys
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.utils.viz import *
 from tensorpack.tfutils.summary import add_moving_summary
@@ -168,21 +169,6 @@ def get_data():
     return ds

-def get_config():
-    logger.auto_set_dir()
-    dataset = get_data()
-    return TrainConfig(
-        dataflow=dataset,
-        callbacks=[
-            PeriodicTrigger(ModelSaver(), every_k_epochs=3),
-            ScheduledHyperParamSetter('learning_rate', [(200, 1e-4)])
-        ],
-        model=Model(),
-        steps_per_epoch=dataset.size(),
-        max_epoch=300,
-    )

 def sample(datadir, model_path):
     pred = PredictConfig(
         session_init=get_model_loader(model_path),
@@ -218,9 +204,19 @@ if __name__ == '__main__':
     BATCH = args.batch
     if args.sample:
         assert args.load
         sample(args.data, args.load)
     else:
-        config = get_config()
-        if args.load:
-            config.session_init = SaverRestore(args.load)
-        GANTrainer(config).train()
+        logger.auto_set_dir()
+        data = QueueInput(get_data())
+        GANTrainer(data, Model()).train_with_defaults(
+            callbacks=[
+                PeriodicTrigger(ModelSaver(), every_k_epochs=3),
+                ScheduledHyperParamSetter('learning_rate', [(200, 1e-4)])
+            ],
+            steps_per_epoch=data.size(),
+            max_epoch=300,
+            session_init=SaverRestore(args.load) if args.load else None
+        )
```
examples/GAN/Improved-WGAN.py  (view file @ f0243500)

```diff
@@ -6,6 +6,7 @@
 import os
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.summary import add_moving_summary
 from tensorpack.utils.globvars import globalns as G
@@ -94,12 +95,11 @@ if __name__ == '__main__':
     else:
         assert args.data
         logger.auto_set_dir()
-        config = TrainConfig(
-            model=Model(),
-            dataflow=DCGAN.get_data(args.data),
+        SeparateGANTrainer(
+            QueueInput(DCGAN.get_data(args.data)),
+            Model(), g_period=6).train_with_defaults(
             callbacks=[ModelSaver()],
             steps_per_epoch=300,
             max_epoch=200,
-        )
-        SeparateGANTrainer(config, g_period=6).train()
+            session_init=SaverRestore(args.load) if args.load else None
+        )
```
examples/GAN/InfoGAN-mnist.py  (view file @ f0243500)

```diff
@@ -10,6 +10,7 @@ import os
 import sys
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.utils import viz
 from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope, under_name_scope
@@ -189,17 +190,6 @@ def get_data():
     return ds

-def get_config():
-    logger.auto_set_dir('d')
-    return TrainConfig(
-        dataflow=get_data(),
-        callbacks=[ModelSaver(keep_freq=0.1)],
-        model=Model(),
-        steps_per_epoch=500,
-        max_epoch=100,
-    )

 def sample(model_path):
     pred = OfflinePredictor(PredictConfig(
         session_init=get_model_loader(model_path),
@@ -254,7 +244,11 @@ if __name__ == '__main__':
         BATCH = 100
         sample(args.load)
     else:
-        config = get_config()
-        if args.load:
-            config.session_init = SaverRestore(args.load)
-        GANTrainer(config).train()
+        logger.auto_set_dir()
+        GANTrainer(QueueInput(get_data()), Model()).train_with_defaults(
+            callbacks=[ModelSaver(keep_freq=0.1)],
+            steps_per_epoch=500,
+            max_epoch=100,
+            session_init=SaverRestore(args.load) if args.load else None
+        )
```
examples/GAN/WGAN.py  (view file @ f0243500)

```diff
@@ -6,6 +6,7 @@
 import os
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils import optimizer
 from tensorpack.tfutils.summary import add_moving_summary
@@ -75,14 +76,15 @@ if __name__ == '__main__':
     else:
         assert args.data
         logger.auto_set_dir()
-        config = TrainConfig(
-            model=Model(),
-            dataflow=DCGAN.get_data(args.data),
+        # The original code uses a different schedule, but this seems to work well.
+        # Train 1 D after 2 G
+        SeparateGANTrainer(
+            input=QueueInput(DCGAN.get_data(args.data)),
+            model=Model(), d_period=3).train_with_defaults(
             callbacks=[ModelSaver(), ClipCallback()],
             steps_per_epoch=500,
             max_epoch=200,
-        )
-        # The original code uses a different schedule, but this seems to work well.
-        # Train 1 D after 2 G
-        SeparateGANTrainer(config, d_period=3).train()
+            session_init=SaverRestore(args.load) if args.load else None
+        )
```
examples/HED/hed.py  (view file @ f0243500)

```diff
@@ -11,6 +11,7 @@ from six.moves import zip
 import os
 import sys

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 import tensorpack.tfutils.symbolic_functions as symbf
 from tensorpack.dataflow import dataset
@@ -231,5 +232,6 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = get_model_loader(args.load)
-    config.nr_tower = max(get_nr_gpu(), 1)
-    SyncMultiGPUTrainer(config).train()
+    launch_train_with_config(
+        config,
+        SyncMultiGPUTrainer(max(get_nr_gpu(), 1)))
```
examples/Inception/inception-bn.py  (view file @ f0243500)

```diff
@@ -9,6 +9,7 @@ import numpy as np
 import os
 import tensorflow as tf

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.symbolic_functions import *
 from tensorpack.tfutils.summary import *
@@ -192,6 +193,6 @@ if __name__ == '__main__':
     if args.load:
         config.session_init = SaverRestore(args.load)
     if args.gpu:
-        config.nr_tower = len(args.gpu.split(','))
-    assert config.nr_tower == NR_GPU
-    SyncMultiGPUTrainer(config).train()
+        nr_tower = len(args.gpu.split(','))
+        assert nr_tower == NR_GPU
+    launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
```
examples/Inception/inceptionv3.py  (view file @ f0243500)

```diff
@@ -10,6 +10,7 @@ import os
 import tensorflow as tf
 import multiprocessing

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.symbolic_functions import *
 from tensorpack.tfutils.summary import *
@@ -298,5 +299,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = SaverRestore(args.load)
-    config.nr_tower = NR_GPU
-    SyncMultiGPUTrainer(config).train()
+    launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
```
examples/PennTreebank/PTB-LSTM.py  (view file @ f0243500)

```diff
@@ -7,6 +7,7 @@ import numpy as np
 import os
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.gradproc import *
 from tensorpack.tfutils import optimizer, summary
@@ -174,4 +175,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = SaverRestore(args.load)
-    SimpleTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
```
examples/ResNet/cifar10-resnet.py  (view file @ f0243500)

```diff
@@ -7,6 +7,7 @@ import numpy as np
 import argparse
 import os

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.symbolic_functions import *
 from tensorpack.tfutils.summary import *
@@ -171,7 +172,7 @@ if __name__ == '__main__':
                 [(1, 0.1), (82, 0.01), (123, 0.001), (300, 0.0002)])
         ],
         max_epoch=400,
-        nr_tower=max(get_nr_gpu(), 1),
         session_init=SaverRestore(args.load) if args.load else None
     )
-    SyncMultiGPUTrainerParameterServer(config).train()
+    nr_gpu = max(get_nr_gpu(), 1)
+    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
```
examples/ResNet/imagenet-resnet.py  (view file @ f0243500)

```diff
@@ -9,10 +9,12 @@ import os
 import tensorflow as tf

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import logger, QueueInput
 from tensorpack.models import *
 from tensorpack.callbacks import *
-from tensorpack.train import TrainConfig, SyncMultiGPUTrainerParameterServer
+from tensorpack.train import (
+    TrainConfig, SyncMultiGPUTrainerParameterServer, launch_train_with_config)
 from tensorpack.dataflow import imgaug, FakeData
 from tensorpack.tfutils import argscope, get_model_loader
 from tensorpack.utils.gpu import get_nr_gpu
@@ -132,4 +134,5 @@ if __name__ == '__main__':
     config = get_config(model, fake=args.fake)
     if args.load:
         config.session_init = get_model_loader(args.load)
-    SyncMultiGPUTrainerParameterServer(config).train()
+    trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
+    launch_train_with_config(config, trainer)
```
examples/ResNet/load-resnet.py  (view file @ f0243500)

```diff
@@ -152,7 +152,7 @@ def convert_param_name(param):
     for k, v in six.iteritems(param):
         try:
             newname = name_conversion(k)
-        except:
+        except Exception:
             logger.error("Exception when processing caffe layer {}".format(k))
             raise
         logger.info("Name Transform: " + k + ' --> ' + newname)
```
examples/ResNet/svhn-resnet.py  (deleted, 100755 → 0; view file @ d38d22bf)

The whole script was removed. Its former content:

```python
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# File: svhn-resnet.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>

import argparse
import numpy as np
import os

from tensorpack import *
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.tfutils.summary import *
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
import tensorflow as tf

"""
ResNet-110 for SVHN Digit Classification.
Reach 1.8% validation error after 70 epochs, with 2 TitanX. 2it/s.
You might need to adjust the learning rate schedule when running with 1 GPU.
"""

import imp
cifar_example = imp.load_source(
    'cifar_example',
    os.path.join(os.path.dirname(__file__), 'cifar10-resnet.py'))
Model = cifar_example.Model

BATCH_SIZE = 128


def get_data(train_or_test):
    isTrain = train_or_test == 'train'
    pp_mean = dataset.SVHNDigit.get_per_pixel_mean()
    if isTrain:
        d1 = dataset.SVHNDigit('train')
        d2 = dataset.SVHNDigit('extra')
        ds = RandomMixData([d1, d2])
    else:
        ds = dataset.SVHNDigit('test')

    if isTrain:
        augmentors = [
            imgaug.CenterPaste((40, 40)),
            imgaug.Brightness(10),
            imgaug.Contrast((0.8, 1.2)),
            imgaug.GaussianDeform(  # this is slow. without it, can only reach 1.9% error
                [(0.2, 0.2), (0.2, 0.8), (0.8, 0.8), (0.8, 0.2)],
                (40, 40), 0.2, 3),
            imgaug.RandomCrop((32, 32)),
            imgaug.MapImage(lambda x: x - pp_mean),
        ]
    else:
        augmentors = [
            imgaug.MapImage(lambda x: x - pp_mean)
        ]
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, 128, remainder=not isTrain)
    if isTrain:
        ds = PrefetchData(ds, 5, 5)
    return ds


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    logger.auto_set_dir()

    dataset_train = get_data('train')
    dataset_test = get_data('test')

    config = TrainConfig(
        model=Model(n=18),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'), ClassificationError()]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(1, 0.1), (20, 0.01), (28, 0.001), (50, 0.0001)])
        ],
        nr_tower=max(get_nr_gpu(), 1),
        session_init=SaverRestore(args.load) if args.load else None,
        max_epoch=500,
    )
    SyncMultiGPUTrainerParameterServer(config).train()
```
examples/Saliency/CAM-resnet.py  (view file @ f0243500)

```diff
@@ -9,6 +9,7 @@ import numpy as np
 import os
 import multiprocessing

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 import tensorflow as tf
 from tensorflow.contrib.layers import variance_scaling_initializer
 from tensorpack import *
@@ -19,9 +20,10 @@ from tensorpack.tfutils.summary import *
 from tensorpack.utils.gpu import get_nr_gpu
 from tensorpack.utils import viz

-from imagenet_resnet_utils import (
-    fbresnet_augmentor, preresnet_basicblock, preresnet_group,
-    image_preprocess, compute_loss_and_error)
+from imagenet_utils import (
+    fbresnet_augmentor, image_preprocess, compute_loss_and_error)
+from resnet_model import (
+    preresnet_basicblock, preresnet_group)

 TOTAL_BATCH_SIZE = 256
@@ -90,10 +92,6 @@ def get_data(train_or_test):
 def get_config():
     nr_gpu = get_nr_gpu()
-    global BATCH_SIZE
-    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu
-
     dataset_train = get_data('train')
     dataset_val = get_data('val')
@@ -111,7 +109,6 @@ def get_config():
         ],
         steps_per_epoch=5000,
         max_epoch=105,
-        nr_tower=nr_gpu
     )
@@ -163,6 +160,9 @@ if __name__ == '__main__':
     if args.gpu:
         os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

+    nr_gpu = get_nr_gpu()
+    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu
+
     if args.cam:
         BATCH_SIZE = 128    # something that can run on one gpu
         viz_cam(args.load, args.data)
@@ -172,4 +172,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = get_model_loader(args.load)
-    SyncMultiGPUTrainerParameterServer(config).train()
+    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
```
examples/Saliency/imagenet_resnet_utils.py  (deleted symlink, 120000 → 0; view file @ d38d22bf)

    ../ResNet/imagenet_resnet_utils.py  (no newline at end of file)

examples/Saliency/imagenet_utils.py  (new symlink, 0 → 120000; view file @ f0243500)

    ../ResNet/imagenet_utils.py  (no newline at end of file)

examples/Saliency/resnet_model.py  (new symlink, 0 → 120000; view file @ f0243500)

    ../ResNet/resnet_model.py  (no newline at end of file)
examples/ShuffleNet/shufflenet.py  (view file @ f0243500)

```diff
@@ -10,10 +10,11 @@ import cv2
 import tensorflow as tf

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import logger, QueueInput, InputDesc, PlaceholderInput, TowerContext
 from tensorpack.models import *
 from tensorpack.callbacks import *
-from tensorpack.train import TrainConfig, SyncMultiGPUTrainerParameterServer
+from tensorpack.train import *
 from tensorpack.dataflow import imgaug
 from tensorpack.tfutils import argscope, get_model_loader
 from tensorpack.tfutils.scope_utils import under_name_scope
@@ -141,8 +142,7 @@ def get_data(name, batch):
         args.data, name, batch, augmentors)

-def get_config(model):
-    nr_tower = max(get_nr_gpu(), 1)
+def get_config(model, nr_tower):
     batch = TOTAL_BATCH_SIZE // nr_tower
     logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
@@ -170,7 +170,6 @@ def get_config(model, nr_tower):
         callbacks=callbacks,
         steps_per_epoch=5000,
         max_epoch=100,
-        nr_tower=nr_tower
     )
@@ -205,5 +204,6 @@ if __name__ == '__main__':
         logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))

-        config = get_config(model)
-        SyncMultiGPUTrainerParameterServer(config).train()
+        nr_tower = max(get_nr_gpu(), 1)
+        config = get_config(model, nr_tower)
+        launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_tower))
```
examples/SimilarityLearning/mnist-embeddings.py
@@ -9,7 +9,7 @@ import argparse
 import tensorflow as tf
 import tensorflow.contrib.slim as slim

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 import tensorpack.tfutils.symbolic_functions as symbf
 from tensorpack.tfutils.summary import add_moving_summary
@@ -442,4 +442,4 @@ if __name__ == '__main__':
         if args.load:
             config.session_init = SaverRestore(args.load)
         else:
-            SimpleTrainer(config).train()
+            launch_train_with_config(config, SimpleTrainer())
examples/SpatialTransformer/mnist-addition.py
@@ -10,6 +10,7 @@ import os
 import sys
 import argparse

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.dataflow import dataset
 from tensorpack.tfutils import sesscreate, optimizer, summary
@@ -186,4 +187,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = SaverRestore(args.load)
-    SimpleTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
examples/boilerplate.py
@@ -5,6 +5,7 @@
 import os
 import argparse
 import tensorflow as tf

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *

 """
@@ -51,7 +52,7 @@ def get_config():
     return TrainConfig(
         model=Model(),
-        dataflow=ds_train,
+        data=QueueInput(ds_train),
         callbacks=[
             ModelSaver(),
             InferenceRunner(ds_test, [ScalarStats('total_costs')]),
@@ -77,4 +78,4 @@ if __name__ == '__main__':
     if args.load:
         config.session_init = SaverRestore(args.load)
-    SyncMultiGPUTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
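The `dataflow=` to `data=QueueInput(...)` change in this hunk makes the input pipeline explicit: judging by the hunks in this commit, `TrainConfig` accepts either a raw DataFlow via `dataflow=` or an `InputSource` via `data=`. A minimal sketch of the two spellings (assuming `ds_train` is a DataFlow):

    # Implicit: TrainConfig wraps the DataFlow in a default InputSource.
    config = TrainConfig(model=Model(), dataflow=ds_train, callbacks=[])
    # Explicit: pick the InputSource yourself (here a CPU-side queue).
    config = TrainConfig(model=Model(), data=QueueInput(ds_train), callbacks=[])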
examples/cifar-convnet.py
@@ -2,12 +2,13 @@
 # -*- coding: UTF-8 -*-
 # File: cifar-convnet.py
 # Author: Yuxin Wu <ppwwyyxxc@gmail.com>
-from tensorpack import *
 import tensorflow as tf
 import argparse
 import numpy as np
 import os

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
+from tensorpack import *
 import tensorpack.tfutils.symbolic_functions as symbf
 from tensorpack.tfutils.summary import *
 from tensorpack.dataflow import dataset
@@ -151,8 +152,7 @@ if __name__ == '__main__':
     if args.load:
         config.session_init = SaverRestore(args.load)

-    config.nr_tower = max(len(args.gpu.split(',')), 1)
-    if config.nr_tower <= 1:
-        QueueInputTrainer(config).train()
-    else:
-        SyncMultiGPUTrainerParameterServer(config).train()
+    nr_gpu = len(args.gpu.split(','))
+    trainer = QueueInputTrainer() if nr_gpu <= 1 \
+        else SyncMultiGPUTrainerParameterServer(nr_gpu)
+    launch_train_with_config(config, trainer)
examples/mnist-convnet.py
@@ -12,6 +12,7 @@ MNIST ConvNet example.
 about 0.6% validation error after 30 epochs.
 """

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 # Just import everything into current namespace
 from tensorpack import *
 from tensorpack.tfutils import summary
@@ -142,4 +143,4 @@ if __name__ == '__main__':
         config.session_init = SaverRestore(args.load)
     # SimpleTrainer is slow, this is just a demo.
     # You can use QueueInputTrainer instead
-    SimpleTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
examples/mnist-tfslim.py
@@ -14,6 +14,7 @@ the only differences are:
 2. use slim names to summarize weights
 """

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.dataflow import dataset
 import tensorflow as tf
@@ -101,4 +102,4 @@ if __name__ == '__main__':
         os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
     config = get_config()
-    SimpleTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
examples/mnist-visualizations.py
@@ -11,6 +11,7 @@ import argparse
 MNIST ConvNet example with weights/activations visualization.
 """

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.dataflow import dataset
 import tensorflow as tf
@@ -161,4 +162,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = SaverRestore(args.load)
-    SimpleTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
examples/svhn-digit-convnet.py
@@ -7,6 +7,7 @@ import argparse
 import numpy as np
 import os

+os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # will become default soon
 from tensorpack import *
 from tensorpack.tfutils.symbolic_functions import prediction_incorrect
 from tensorpack.dataflow import dataset
@@ -99,7 +100,7 @@ def get_config():
     return TrainConfig(
         model=Model(),
-        dataflow=data_train,
+        data=QueueInput(data_train),
         callbacks=[
             ModelSaver(),
             InferenceRunner(data_test,
@@ -125,4 +126,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = SaverRestore(args.load)
-    QueueInputTrainer(config).train()
+    launch_train_with_config(config, SimpleTrainer())
examples/tox.ini
 [flake8]
 max-line-length = 120
-ignore = F403,F401,F405,F841,E401
+ignore = F403,F401,F405,F841,E401,E741,E742,E743
 exclude = private,
           FasterRCNN/utils
tensorpack/__init__.py
@@ -18,9 +18,9 @@ if _HAS_TF:
     # In development. Default to v1
     if _os.environ.get('TENSORPACK_TRAIN_API', 'v1') == 'v2':
-        from tensorpack.trainv2 import *
+        from tensorpack.train import *
     else:
-        from tensorpack.train import *
+        from tensorpack.trainv1 import *
     from tensorpack.graph_builder import InputDesc, ModelDesc, ModelDescBase
     from tensorpack.input_source import *
     from tensorpack.predict import *
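After this change `tensorpack.train` holds the new trainers and the old ones move to `tensorpack.trainv1`; the `TENSORPACK_TRAIN_API` environment variable decides which set `from tensorpack import *` exposes. A sketch of opting in, as every converted example in this commit does (the variable must be set before tensorpack is first imported):

    import os
    os.environ['TENSORPACK_TRAIN_API'] = 'v2'   # must run before the import below
    from tensorpack import *                    # now exports the v2 trainers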
tensorpack/callbacks/inference.py
@@ -38,7 +38,7 @@ class Inferencer(Callback):
         for k, v in six.iteritems(ret):
             try:
                 v = float(v)
-            except:
+            except ValueError:
                 logger.warn("{} returns a non-scalar statistics!".format(type(self).__name__))
                 continue
             else:
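This hunk, like the similar ones below, narrows a bare `except:` to a specific exception type. A bare `except:` also traps `KeyboardInterrupt` and `SystemExit`, which can hide a user's Ctrl-C; a small illustration of the difference (not from the codebase):

    try:
        v = float('not-a-number')
    except ValueError:   # catches only the failed conversion
        v = None
    # A bare 'except:' here would additionally swallow KeyboardInterrupt,
    # making the surrounding loop hard to abort.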
tensorpack/callbacks/inference_runner.py
@@ -203,7 +203,7 @@ class DataParallelInferenceRunner(InferenceRunnerBase):
         self._input_callbacks = Callbacks(input_callbacks)
         # InputSource might have hooks which break us.
-        # e.g. hooks from StagingInputWrapper will force the consumption
+        # e.g. hooks from StagingInput will force the consumption
         # of nr_tower datapoints in every run.
         input_hooks = self._input_callbacks.get_hooks()
         self._hooks = [self._build_hook(inf) for inf in self.infs] + input_hooks
tensorpack/callbacks/param.py
@@ -199,7 +199,7 @@ class HumanHyperParamSetter(HyperParamSetter):
             dic = {str(k): float(v) for k, v in lines}
             ret = dic[self.param.readable_name]
             return ret
-        except:
+        except Exception:
             logger.warn("Cannot find {} in {}".format(self.param.readable_name, self.file_name))
tensorpack/dataflow/common.py
@@ -129,7 +129,7 @@ class BatchData(ProxyDataFlow):
             else:
                 try:
                     tp = dt.dtype
-                except:
+                except AttributeError:
                     raise TypeError("Unsupported type to batch: {}".format(type(dt)))
             try:
                 result.append(
@@ -144,7 +144,7 @@ class BatchData(ProxyDataFlow):
                 try:
                     # open an ipython shell if possible
                     import IPython as IP; IP.embed()    # noqa
-                except:
+                except ImportError:
                     pass
         return result
tensorpack/dataflow/dataset/ilsvrc.py
@@ -247,7 +247,7 @@ class ILSVRC12(ILSVRC12Files):
                 cnt += 1
             except KeyboardInterrupt:
                 raise
-            except:
+            except Exception:
                 ret.append(None)
         logger.info("{}/{} images have bounding box.".format(cnt, len(imglist)))
         return ret
tensorpack/dataflow/prefetch.py
@@ -61,7 +61,7 @@ def _zmq_catch_error(name):
             raise DataFlowTerminated()
         else:
             raise
-    except:
+    except Exception:
         raise
@@ -110,7 +110,7 @@ class _MultiProcessZMQDataFlow(DataFlow):
             x.terminate()
         try:
             print("{} successfully cleaned-up.".format(type(self).__name__))
-        except:
+        except Exception:
             pass
@@ -347,7 +347,7 @@ class MultiThreadMapData(ProxyDataFlow):
                     return
                 # cannot ignore None here. will lead to unsynced send/recv
                 self.outq.put(self.func(dp))
-            except:
+            except Exception:
                 if self.stopped():
                     pass    # skip duplicated error messages
                 else:
tensorpack/graph_builder/model_desc.py
@@ -86,16 +86,25 @@ class ModelDescBase(object):
             :returns: a list of InputDesc
         """

-    def build_graph(self, inputs):
+    def build_graph(self, *args):
         """
         Build the whole symbolic graph.

         Args:
-            inputs (list[tf.Tensor]): a list of tensors,
+            args (list[tf.Tensor]): a list of tensors,
                 that match the list of :class:`InputDesc` defined by ``_get_inputs``.
         """
-        if isinstance(inputs, InputSource):
-            inputs = inputs.get_input_tensors()
+        if len(args) == 1:
+            arg = args[0]
+            if isinstance(arg, InputSource):
+                inputs = arg.get_input_tensors()
+            # remove in the future?
+            if isinstance(arg, (list, tuple)):
+                inputs = arg
+            else:
+                inputs = [arg]
+        else:
+            inputs = args
         assert len(inputs) == len(self.get_inputs_desc()), \
             "Number of inputs passed to the graph != number of inputs defined " \
             "in ModelDesc! ({} != {})".format(len(inputs), len(self.get_inputs_desc()))
@@ -148,14 +157,11 @@ class ModelDesc(ModelDescBase):
     def _get_optimizer(self):
         raise NotImplementedError()

-    def build_graph_get_cost(self, *inputs):
-        """
-        Build the graph from inputs and return the cost tensor.
-        """
+    def _build_graph_get_cost(self, *inputs):
         self.build_graph(inputs)
         return self.get_cost()

-    def build_graph_get_grads(self, *inputs):
+    def _build_graph_get_grads(self, *inputs):
         """
         Build the graph from inputs and return the grads.
         This is useful for most of the :class:`GraphBuilder` which expects such a function.
@@ -164,7 +170,7 @@ class ModelDesc(ModelDescBase):
             [(grad, var)]
         """
         ctx = get_current_tower_context()
-        cost = self.build_graph_get_cost(*inputs)
+        cost = self._build_graph_get_cost(*inputs)
         varlist = ctx.filter_vars_by_vs_name(tf.trainable_variables())
         opt = self.get_optimizer()
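With this change `build_graph` normalizes several calling conventions before the length check. A sketch of the forms now accepted (assuming `a` and `b` are tensors matching a model with two InputDescs):

    model.build_graph(a, b)       # varargs, the new preferred form
    model.build_graph([a, b])     # a single list/tuple is unpacked internally
    model.build_graph(source)     # an InputSource; its tensors are extracted
                                  # (the code marks this path "remove in the future?")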
tensorpack/input_source/input_source.py
@@ -28,7 +28,8 @@ __all__ = ['PlaceholderInput', 'FeedInput',
            'QueueInput', 'BatchQueueInput',
            'DummyConstantInput', 'TensorInput',
-           'TFDatasetInput', 'StagingInputWrapper']
+           'TFDatasetInput', 'StagingInputWrapper',
+           'StagingInput']

 class PlaceholderInput(InputSource):
@@ -398,7 +399,7 @@ class TFDatasetInput(FeedfreeInput):
         return self._iterator.get_next()

-class StagingInputWrapper(FeedfreeInput):
+class StagingInput(FeedfreeInput):
     """
     A wrapper around a feedfree input,
     to prefetch the input in StagingArea (on GPUs).
@@ -433,7 +434,7 @@ class StagingInputWrapper(FeedfreeInput):
         self._input = input
         if not isinstance(towers[0], int):
             # API changed
-            log_deprecated("StagingInputWrapper(devices=)", "Use (towers=) instead!", "2018-01-31")
+            log_deprecated("StagingInput(devices=)", "Use (towers=) instead!", "2018-01-31")
             self._devices = towers
         else:
             self._devices = ['/gpu:{}'.format(k) for k in towers]
@@ -451,7 +452,7 @@ class StagingInputWrapper(FeedfreeInput):
         cbs = self._input.get_callbacks()
         cbs.append(
-            StagingInputWrapper.StagingCallback(
+            StagingInput.StagingCallback(
                 self._get_stage_op(), self._get_unstage_op(), self._nr_stage))
         return cbs
@@ -488,3 +489,6 @@ class StagingInputWrapper(FeedfreeInput):
         with self.cached_name_scope():
             all_outputs = list(chain.from_iterable(self._unstage_ops))
             return tf.group(*all_outputs)
+
+
+StagingInputWrapper = StagingInput
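The trailing `StagingInputWrapper = StagingInput` keeps the old name importable as a backwards-compatible alias, so both spellings refer to the same class. A sketch (assuming `inp` is a feedfree InputSource prefetching for two towers):

    staged = StagingInput(inp, towers=[0, 1])     # new name
    legacy = StagingInputWrapper(inp, [0, 1])     # old name, same class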
tensorpack/models/shape_utils.py
@@ -16,7 +16,7 @@ class StaticDynamicAxis(object):
         try:
             st = f(self.static)
             return StaticDynamicAxis(st, st)
-        except:
+        except TypeError:
             return StaticDynamicAxis(None, f(self.dynamic))

     def __str__(self):
@@ -53,7 +53,7 @@ class StaticDynamicShape(object):
             self.static[axis] = st
             self.dynamic[axis] = StaticLazyAxis(st)
             return
-        except:
+        except TypeError:
             pass
         self.static[axis] = None
         dyn = self.dynamic[axis]
tensorpack/train/base.py (diff collapsed)
tensorpack/trainv2/interface.py → tensorpack/train/interface.py
@@ -5,13 +5,13 @@
 import tensorflow as tf

 from ..input_source import (
-    InputSource, FeedInput, QueueInput, StagingInputWrapper, DummyConstantInput)
+    InputSource, FeedInput, QueueInput, StagingInput, DummyConstantInput)
-from ..train.config import TrainConfig
-from .base import SingleCostTrainer
+from ..trainv1.config import TrainConfig
+from .tower import SingleCostTrainer
 from .trainers import SimpleTrainer, DistributedTrainerReplicated

-__all__ = ['launch_train_with_config', 'TrainConfig', 'apply_default_prefetch']
+__all__ = ['launch_train_with_config', 'apply_default_prefetch']


 def apply_default_prefetch(input_source_or_dataflow, trainer, towers):
@@ -36,19 +36,26 @@ def apply_default_prefetch(input_source_or_dataflow, trainer, towers):
         assert not isinstance(trainer, SimpleTrainer)
         assert tf.test.is_gpu_available()

-        if not isinstance(input, (StagingInputWrapper, DummyConstantInput)):
-            input = StagingInputWrapper(input, towers)
+        if not isinstance(input, (StagingInput, DummyConstantInput)):
+            input = StagingInput(input, towers)
     return input


 def launch_train_with_config(config, trainer):
     """
-    Train with a :class:`TrainConfig` and a new version of :class:`Trainer`, to
-    mimic the old training interface.
+    Train with a :class:`TrainConfig` and a :class:`Trainer`, to
+    mimic the old training interface. It basically does the following
+    3 things (and you can easily do them by yourself):
+
+    1. Setup the :class:`InputSource` with automatic prefetching,
+       for `config.data` or `config.dataflow`.
+    2. Call `trainer.setup_graph` with the :class:`InputSource`,
+       as well as `config.model`.
+    3. Call `trainer.train` with rest of the attributes of config.

     Args:
         config (TrainConfig):
-        trainer (Trainer): an instance of the new trainer
+        trainer (Trainer): an instance of a SingleCostTrainer

     Examples:
@@ -78,7 +85,7 @@ def launch_train_with_config(config, trainer):
     trainer.setup_graph(
         inputs_desc, input,
-        model.build_graph_get_cost, model.get_optimizer)
+        model._build_graph_get_cost, model.get_optimizer)
     trainer.train(
         config.callbacks, config.monitors,
         config.session_creator, config.session_init,
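Per the updated docstring, `launch_train_with_config` is just glue over three explicit steps, so config-based scripts migrate mechanically. A minimal sketch of the new entry point (assuming `config` was built as in the examples above):

    # Old interface:  SyncMultiGPUTrainerParameterServer(config).train()
    # New interface:  the trainer takes GPUs; the config is passed at launch.
    trainer = SyncMultiGPUTrainerParameterServer([0, 1])
    launch_train_with_config(config, trainer)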
tensorpack/train/tower.py (new file, 0 → 100644)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: tower.py

import tensorflow as tf
import six
from abc import abstractmethod, ABCMeta

from ..utils.argtools import call_only_once, memoized
from ..graph_builder.predictor_factory import SimplePredictBuilder
from ..input_source import PlaceholderInput
from ..predict.base import OnlinePredictor
from ..tfutils.tower import TowerFuncWrapper, get_current_tower_context
from ..tfutils.gradproc import FilterNoneGrad
from .base import Trainer

__all__ = ['SingleCostTrainer', 'TowerTrainer']


class TowerTrainer(Trainer):
    """
    Base trainer for models that can be built by calling a tower function under a :class:`TowerContext`.
    This is required by some features that replicate the model
    automatically, e.g. creating a predictor.
    """

    tower_func = None
    """
    A :class:`TowerFuncWrapper` instance.
    A callable which takes some input tensors and builds one replicate of the model.
    """

    @call_only_once
    def set_tower_func(self, tower_func):
        """
        Args:
            tower_func (TowerFuncWrapper)
        """
        assert isinstance(tower_func, TowerFuncWrapper), tower_func
        self.tower_func = tower_func

    @property
    def inputs_desc(self):
        """
        Returns:
            list[InputDesc]: metainfo about the inputs to the tower.
        """
        return self.tower_func.inputs_desc

    def get_predictor(self, input_names, output_names, device=0):
        """
        Returns a callable predictor built under ``TowerContext(is_training=False)``.

        Args:
            input_names (list), output_names(list): list of names
            device (int): build the predictor on device '/gpu:{device}' or use -1 for '/cpu:0'.

        Returns:
            an :class:`OnlinePredictor`.
        """
        assert self.tower_func is not None, "Must set tower_func on the trainer to use get_predictor()!"
        tower_name = 'tower-pred-{}'.format(device) if device >= 0 else 'tower-pred-cpu'
        try:
            tower = self.tower_func.towers[tower_name]
        except KeyError:
            input = PlaceholderInput()
            input.setup(self.inputs_desc)
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                SimplePredictBuilder(
                    ns_name=tower_name, vs_name=self._main_tower_vs_name,
                    device=device).build(input, self.tower_func)
            tower = self.tower_func.towers[tower_name]
        input_tensors = tower.get_tensors(input_names)
        output_tensors = tower.get_tensors(output_names)
        return OnlinePredictor(input_tensors, output_tensors)

    @property
    def _main_tower_vs_name(self):
        """
        The vs name for the "main" copy of the model,
        to be used to build predictors.
        """
        return ""


@six.add_metaclass(ABCMeta)
class SingleCostTrainer(TowerTrainer):
    """
    Base class for single-cost trainer.

    Single-cost trainer has a :meth:`setup_graph` method which takes
    (inputs_desc, input, get_cost_fn, get_opt_fn), and build the training operations from them.

    To use a :class:`SingleCostTrainer` object, call `trainer.setup_graph(...); trainer.train(...)`.
    """

    @call_only_once
    def setup_graph(self, inputs_desc, input, get_cost_fn, get_opt_fn):
        """
        Responsible for building the main training graph for single-cost training.

        Args:
            inputs_desc ([InputDesc]):
            input (InputSource):
            get_cost_fn ([tf.Tensor] -> tf.Tensor): callable, takes some input tensors and return a cost tensor.
            get_opt_fn (-> tf.train.Optimizer): callable which returns an
                optimizer. Will only be called once.

        Note:
            1. `get_cost_fn` will always be called under a :class:`TowerContext`,
               which will contain information about reuse,
               training/inference, scope name, etc.
            2. `get_cost_fn` might get called multiple times for data-parallel training or inference.
            3. To respect variable reuse, use `tf.get_variable` instead of
               `tf.Variable` in `get_cost_fn`.
        """
        get_cost_fn = TowerFuncWrapper(get_cost_fn, inputs_desc)
        get_opt_fn = memoized(get_opt_fn)
        self.set_tower_func(get_cost_fn)

        input_callbacks = self._setup_input(inputs_desc, input)
        train_callbacks = self._setup_graph(input, get_cost_fn, get_opt_fn)
        internal_callbacks = input_callbacks + train_callbacks
        for cb in internal_callbacks:
            self._register_callback(cb)
        # TODO register directly instead of return?

    @abstractmethod
    def _setup_graph(self, input, get_cost_fn, get_opt_fn):
        """
        Implement the logic to build the graph, with an :class:`InputSource`
        that's been setup already.

        Returns:
            [Callback]: list of callbacks needed
        """

    def _setup_input(self, inputs_desc, input):
        assert not input.setup_done()
        return input.setup(inputs_desc)

    def _make_get_grad_fn(self, input, get_cost_fn, get_opt_fn):
        """
        Returns:
            a get_grad_fn for GraphBuilder to use.
        """
        # internal use only
        assert input.setup_done()

        def get_grad_fn():
            ctx = get_current_tower_context()
            cost = get_cost_fn(*input.get_input_tensors())

            varlist = ctx.filter_vars_by_vs_name(tf.trainable_variables())
            opt = get_opt_fn()
            grads = opt.compute_gradients(
                cost, var_list=varlist,
                gate_gradients=False, colocate_gradients_with_ops=True)
            grads = FilterNoneGrad().process(grads)
            return grads

        return get_grad_fn
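Taken together, the new classes separate graph building from the training loop. A sketch of driving a `SingleCostTrainer` directly, without `TrainConfig` (the tensor names passed to `get_predictor` are placeholders; `inputs_desc`, `my_input`, `get_cost_fn` and `get_opt_fn` are assumed to exist as described in the docstrings above):

    trainer = SimpleTrainer()
    # 1. Build the training graph once from the four ingredients.
    trainer.setup_graph(inputs_desc, my_input, get_cost_fn, get_opt_fn)
    # 2. Run the loop (callbacks, monitors and session arguments omitted here).
    # trainer.train(...)
    # A predictor can then be built from the same tower function:
    pred = trainer.get_predictor(['input'], ['output'], device=-1)   # on CPU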
tensorpack/trainv2/trainers.py → tensorpack/train/trainers.py
@@ -8,6 +8,7 @@ from ..callbacks.graph import RunOp
 from ..tfutils.sesscreate import NewSessionCreator
 from ..utils import logger
+from ..utils.argtools import map_arg
 from ..tfutils import get_global_step_var
 from ..tfutils.distributed import get_distributed_session_creator
 from ..tfutils.tower import TowerContext
@@ -20,16 +21,24 @@ from ..graph_builder.training import (
 from ..graph_builder.distributed import DistributedReplicatedBuilder
 from ..graph_builder.utils import override_to_local_variable
-from .base import SingleCostTrainer
+from .tower import SingleCostTrainer

 __all__ = ['SimpleTrainer',
            'QueueInputTrainer',
            'SyncMultiGPUTrainer',
            'SyncMultiGPUTrainerReplicated',
            'SyncMultiGPUTrainerParameterServer',
            'AsyncMultiGPUTrainer',
            'DistributedTrainerReplicated']


+def _int_to_range(x):
+    if isinstance(x, int):
+        assert x > 0, x
+        return list(range(x))
+    return x
+
+
 class SimpleTrainer(SingleCostTrainer):
     """
     Single-GPU single-cost single-tower trainer.
@@ -53,13 +62,14 @@ class SyncMultiGPUTrainerParameterServer(SingleCostTrainer):

     __doc__ = SyncMultiGPUParameterServerBuilder.__doc__

-    def __init__(self, towers, ps_device='gpu'):
+    @map_arg(gpus=_int_to_range)
+    def __init__(self, gpus, ps_device='gpu'):
         """
         Args:
-            towers ([int]): list of GPU ids.
+            gpus ([int]): list of GPU ids.
             ps_device: either 'gpu' or 'cpu', where variables are stored. Setting to 'cpu' might help when #gpu>=4
         """
-        self._builder = SyncMultiGPUParameterServerBuilder(towers, ps_device)
+        self._builder = SyncMultiGPUParameterServerBuilder(gpus, ps_device)
         super(SyncMultiGPUTrainerParameterServer, self).__init__()

     def _setup_graph(self, input, get_cost_fn, get_opt_fn):
@@ -68,17 +78,29 @@ class SyncMultiGPUTrainerParameterServer(SingleCostTrainer):
         return []


+def SyncMultiGPUTrainer(gpus):
+    """
+    Return a default multi-GPU trainer, if you don't care about the details.
+    It may not be the most efficient one for your task.
+
+    Args:
+        gpus (list[int]): list of GPU ids.
+    """
+    return SyncMultiGPUTrainerParameterServer(gpus, ps_device='gpu')
+
+
 class AsyncMultiGPUTrainer(SingleCostTrainer):

     __doc__ = AsyncMultiGPUBuilder.__doc__

-    def __init__(self, towers, scale_gradient=True):
+    @map_arg(gpus=_int_to_range)
+    def __init__(self, gpus, scale_gradient=True):
         """
         Args:
-            towers ([int]): list of GPU ids.
+            gpus ([int]): list of GPU ids.
             scale_gradient (bool): if True, will scale each gradient by ``1.0/nr_gpu``.
         """
-        self._builder = AsyncMultiGPUBuilder(towers, scale_gradient)
+        self._builder = AsyncMultiGPUBuilder(gpus, scale_gradient)
         super(AsyncMultiGPUTrainer, self).__init__()

     def _setup_graph(self, input, get_cost_fn, get_opt_fn):
@@ -91,12 +113,13 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):

     __doc__ = SyncMultiGPUReplicatedBuilder.__doc__

-    def __init__(self, towers):
+    @map_arg(gpus=_int_to_range)
+    def __init__(self, gpus):
         """
         Args:
-            towers ([int]): list of GPU ids.
+            gpus ([int]): list of GPU ids.
         """
-        self._builder = SyncMultiGPUReplicatedBuilder(towers)
+        self._builder = SyncMultiGPUReplicatedBuilder(gpus)
         super(SyncMultiGPUTrainerReplicated, self).__init__()

     def _setup_graph(self, input, get_cost_fn, get_opt_fn):
@@ -113,10 +136,11 @@ class DistributedTrainerReplicated(SingleCostTrainer):

     __doc__ = DistributedReplicatedBuilder.__doc__

-    def __init__(self, towers, server):
+    @map_arg(gpus=_int_to_range)
+    def __init__(self, gpus, server):
         """
         Args:
-            towers (list[int]): list of GPU ids.
+            gpus (list[int]): list of GPU ids.
             server (tf.train.Server): the server with ps and workers.
                 The job_name must be 'worker' because 'ps' job doesn't need to
                 build any graph.
@@ -127,7 +151,7 @@ class DistributedTrainerReplicated(SingleCostTrainer):
         if self.job_name == 'worker':
             # ps doesn't build any graph
-            self._builder = DistributedReplicatedBuilder(towers, server)
+            self._builder = DistributedReplicatedBuilder(gpus, server)
             self.is_chief = self._builder.is_chief
         else:
             self.is_chief = False
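Because of the `@map_arg(gpus=_int_to_range)` decorator added above, every multi-GPU trainer now accepts either a GPU count or an explicit id list; a plain positive int `n` is expanded to `list(range(n))` before reaching the builder. The equivalent spellings:

    SyncMultiGPUTrainerReplicated(2)         # expands to gpus=[0, 1]
    SyncMultiGPUTrainerReplicated([0, 1])    # explicit list, same effect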
tensorpack/trainv2/__init__.py → tensorpack/trainv1/__init__.py
@@ -19,7 +19,7 @@ def global_import(name):

 _CURR_DIR = os.path.dirname(__file__)
-_SKIP = []
+_SKIP = ['utility']
 for _, module_name, _ in iter_modules(
         [_CURR_DIR]):
     srcpath = os.path.join(_CURR_DIR, module_name + '.py')
tensorpack/trainv2/base.py → tensorpack/trainv1/base.py (diff collapsed)
tensorpack/train/config.py → tensorpack/trainv1/config.py
@@ -17,9 +17,21 @@ from ..utils.develop import log_deprecated
 __all__ = ['TrainConfig']


+def DEFAULT_CALLBACKS():
+    return [
+        MovingAverageSummary(),
+        ProgressBar(),
+        MergeAllSummaries(),
+        RunUpdateOps()]
+
+
+def DEFAULT_MONITORS():
+    return [TFEventWriter(), JSONWriter(), ScalarPrinter()]
+
+
 class TrainConfig(object):
     """
-    Config for trainer.
+    A collection of options to be used for trainers.
     """

     def __init__(self,
@@ -84,9 +96,9 @@ class TrainConfig(object):
             callbacks = []
         assert_type(callbacks, list)
         self._callbacks = callbacks + \
-            (extra_callbacks or TrainConfig.DEFAULT_EXTRA_CALLBACKS())
+            (extra_callbacks or DEFAULT_CALLBACKS())

-        self.monitors = monitors or TrainConfig.DEFAULT_MONITORS()
+        self.monitors = monitors or DEFAULT_MONITORS()

         if session_init is None:
             session_init = JustCurrentSession()
@@ -148,15 +160,3 @@ class TrainConfig(object):
     @property
     def callbacks(self):        # disable setter
         return self._callbacks
-
-    @staticmethod
-    def DEFAULT_EXTRA_CALLBACKS():
-        return [
-            MovingAverageSummary(),
-            ProgressBar(),
-            MergeAllSummaries(),
-            RunUpdateOps()]
-
-    @staticmethod
-    def DEFAULT_MONITORS():
-        return [TFEventWriter(), JSONWriter(), ScalarPrinter()]
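Moving `DEFAULT_CALLBACKS()` and `DEFAULT_MONITORS()` out of the class turns the defaults into plain module functions that return fresh instances per call. Judging from the constructor above, passing `extra_callbacks` replaces the default set entirely; a sketch (with `df` and `my_cbs` assumed to exist):

    config = TrainConfig(model=Model(), dataflow=df,
                         callbacks=my_cbs,
                         extra_callbacks=[ProgressBar()],   # overrides DEFAULT_CALLBACKS()
                         monitors=DEFAULT_MONITORS())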
tensorpack/train/distributed.py → tensorpack/trainv1/distributed.py
@@ -64,7 +64,7 @@ class DistributedTrainerReplicated(Trainer):
         self._config.callbacks.extend(cbs)

         self.train_op, initial_sync_op, model_sync_op = self._builder.build(
-            lambda: self.model.build_graph_get_grads(
+            lambda: self.model._build_graph_get_grads(
                 *self._input_source.get_input_tensors()),
             self.model.get_optimizer)
tensorpack/trainv1/interface.py (new file, 0 → 100644)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: interface.py

__all__ = ['launch_train_with_config']


def launch_train_with_config(config, trainer):
    from ..train.interface import launch_train_with_config as old_launch
    old_launch(config, trainer)
tensorpack/train/multigpu.py → tensorpack/trainv1/multigpu.py
@@ -8,7 +8,7 @@ import tensorflow as tf
 from ..callbacks.graph import RunOp
 from ..utils.develop import log_deprecated
-from ..input_source import QueueInput, StagingInputWrapper, DummyConstantInput
+from ..input_source import QueueInput, StagingInput, DummyConstantInput
 from ..graph_builder.training import (
     SyncMultiGPUParameterServerBuilder,
     SyncMultiGPUReplicatedBuilder,
@@ -43,8 +43,8 @@ def apply_prefetch_policy(config, gpu_prefetch=True):
         assert tf.test.is_gpu_available()

         # seem to only improve on >1 GPUs
-        if not isinstance(config.data, (StagingInputWrapper, DummyConstantInput)):
-            config.data = StagingInputWrapper(config.data, config.tower)
+        if not isinstance(config.data, (StagingInput, DummyConstantInput)):
+            config.data = StagingInput(config.data, config.tower)


 class SyncMultiGPUTrainerParameterServer(Trainer):
@@ -70,7 +70,7 @@ class SyncMultiGPUTrainerParameterServer(Trainer):
         self.train_op = SyncMultiGPUParameterServerBuilder(
             self._config.tower, self._ps_device).build(
-            lambda: self.model.build_graph_get_grads(
+            lambda: self.model._build_graph_get_grads(
                 *self._input_source.get_input_tensors()),
             self.model.get_optimizer)
@@ -104,7 +104,7 @@ class SyncMultiGPUTrainerReplicated(Trainer):
         self.train_op, post_init_op = SyncMultiGPUReplicatedBuilder(
             self._config.tower).build(
-            lambda: self.model.build_graph_get_grads(
+            lambda: self.model._build_graph_get_grads(
                 *self._input_source.get_input_tensors()),
             self.model.get_optimizer)
@@ -134,7 +134,7 @@ class AsyncMultiGPUTrainer(Trainer):
         self.train_op = AsyncMultiGPUBuilder(
             self._config.tower, self._scale_gradient).build(
-            lambda: self.model.build_graph_get_grads(
+            lambda: self.model._build_graph_get_grads(
                 *self._input_source.get_input_tensors()),
             self.model.get_optimizer)
tensorpack/train/simple.py → tensorpack/trainv1/simple.py
@@ -43,7 +43,7 @@ class SimpleTrainer(Trainer):
         cbs = self._input_source.setup(self.model.get_inputs_desc())

         with TowerContext('', is_training=True):
-            grads = self.model.build_graph_get_grads(
+            grads = self.model._build_graph_get_grads(
                 *self._input_source.get_input_tensors())
         opt = self.model.get_optimizer()
         self.train_op = opt.apply_gradients(grads, name='min_op')
tensorpack/trainv1/utility.py (new file, 0 → 100644)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: utility.py

# for backwards-compatibility
from ..graph_builder.utils import (   # noqa
    OverrideToLocalVariable,
    override_to_local_variable, LeastLoadedDeviceSetter)
tensorpack/user_ops/test-recv-op.py
@@ -15,7 +15,7 @@ from tensorpack.user_ops.zmq_recv import (  # noqa
 try:
     num = int(sys.argv[1])
-except:
+except ValueError:
     num = 2

 ENDPOINT = 'ipc://test-pipe'
tensorpack/utils/fs.py
@@ -53,7 +53,7 @@ def download(url, dir, filename=None):
             fpath, _ = urllib.request.urlretrieve(url, fpath, reporthook=hook(t))
         statinfo = os.stat(fpath)
         size = statinfo.st_size
-    except:
+    except IOError:
         logger.error("Failed to download {}".format(url))
         raise
     assert size > 0, "Download an empty file!"
tensorpack/utils/loadcaffe.py
@@ -135,7 +135,7 @@ def get_caffe_pb():
         version = version.decode('utf-8')
         version = float('.'.join(version.split(' ')[1].split('.')[:2]))
         assert version >= 2.7, "Require protoc>=2.7 for Python3"
-    except:
+    except Exception:
         logger.exception("protoc --version gives: " + str(version))
         raise
tox.ini
 [flake8]
 max-line-length = 120
-ignore = E265
+ignore = E265,E741,E742,E743
 exclude = .git,
           tensorpack/__init__.py,
           setup.py,
           snippet,
-          docs,
-          examples,
+          examples-old,
+          _test.py,
           docs/conf.py