Commit cb1419e8 authored by Yuxin Wu

update docs

parent a6a6cd3e
@@ -21,34 +21,24 @@ matrix:
     - os: linux
       python: 2.7
       env: TF_VERSION=1.3.0
-    - os: linux
-      python: 3.6
-      env: TF_VERSION=1.3.0
     - os: linux
       python: 2.7
       env: TF_VERSION=1.14.0
-    - os: linux
-      python: 3.6
-      env: TF_VERSION=1.14.0 PYPI=true
     - os: linux
       python: 2.7
       env: TF_VERSION=nightly
-    - os: linux
-      python: 3.6
-      env: TF_VERSION=nightly
   allow_failures:
     - env: TF_VERSION=nightly

 install:
   - pip install -U pip  # the pip version on travis is too old
   - pip install .
-  - pip install flake8 scikit-image opencv-python lmdb h5py msgpack
+  - pip install scikit-image opencv-python lmdb h5py msgpack
   # check that dataflow can be imported alone without tensorflow
   - python -c "import tensorpack.dataflow"
   - ./tests/install-tensorflow.sh

 before_script:
-  - flake8 --version
   - protoc --version
   - python -c "import cv2; print('OpenCV '+ cv2.__version__)"
   - python -c "import tensorflow as tf; print('TensorFlow '+ tf.__version__)"
@@ -56,8 +46,6 @@ before_script:
   - export TENSORPACK_DATASET=$HOME/tensorpack_data

 script:
-  - flake8 .
-  - if [[ $TRAVIS_PYTHON_VERSION == '3.6' ]]; then cd examples && flake8 .; fi  # some examples are py3 only
   - $TRAVIS_BUILD_DIR/tests/run-tests.sh
   - cd $TRAVIS_BUILD_DIR  # go back to root so that deploy may work
@@ -76,29 +64,29 @@ notifications:
 # see https://docs.travis-ci.com/user/deployment/pypi/
-deploy:
-  - provider: pypi
-    user: ppwwyyxx
-    distributions: "sdist bdist_wheel"
-    skip_upload_docs: true
-    password:
-      secure: lnNfzPFSk4HF7seuu63CoUa0g4V0JPs42H5FTzWecEIro8IqjdaAvzNKuhu9E4rkrMXPzoYYA6gC4YhseT7N/jg7lyV/Nn1koGXK1gmzu5JnXZXHw5/Ri0I3hOF3OFpEvkR39tzV0r5HsO0JIU3pl11+bBS8iloEtzdTPaUaRgyYxyZGrHl8l3VnUyM50PRnPGDouK6DHxJmknnFCOZFCrEpwN4zpOs55c1rChjJ8aql69rSsXQOUP8++jRtGwgqEvHh0K86uGm1AJUYvSWmcoD+5Urcg8LGaQzySmXtJnFgxtzsORactIEJoAteSMXufWZITn4OQ5VQkc4/CfU0HgHioQw86dpdJrfOLkhEx65JxfUsQiOTgpcTOgYYEda6+dY4cnTTpj2fASVDhQMQ/vo+Ab/W14nYG8z0IPwGJ1qDXRO6AtPD7vbah2LdHQTUTiAbbtva7NWuKbFiVxD2tdrVQHKWqmKXw+JF5F8TBDEnUxFtulW2hbM+vwu6mPxbYQZEpONjLKSa7qiZ8jQZ8cy9KatQYkqPLeGEbgI+IsmA4bnJJennToyWZ2N4W071ddtLB7hDH4ZRVdaLVYtfeKW/b/+YGX3N8p5cMKGIDjpGyF0BocFalQ7gYfg2ouAn1RyEPkCaw6ntA2uzIgvTqxU5inWJCFn20Ogst0oIaPs=
-    on:
-      tags: true
-      branch: master
-      repo: tensorpack/tensorpack
-      python: "3.6"
-      condition: "$PYPI = true"
-  - provider: pypi
-    server: https://testpypi.python.org/pypi
-    user: ppwwyyxx
-    distributions: "sdist bdist_wheel"
-    skip_upload_docs: true
-    password:
-      secure: lnNfzPFSk4HF7seuu63CoUa0g4V0JPs42H5FTzWecEIro8IqjdaAvzNKuhu9E4rkrMXPzoYYA6gC4YhseT7N/jg7lyV/Nn1koGXK1gmzu5JnXZXHw5/Ri0I3hOF3OFpEvkR39tzV0r5HsO0JIU3pl11+bBS8iloEtzdTPaUaRgyYxyZGrHl8l3VnUyM50PRnPGDouK6DHxJmknnFCOZFCrEpwN4zpOs55c1rChjJ8aql69rSsXQOUP8++jRtGwgqEvHh0K86uGm1AJUYvSWmcoD+5Urcg8LGaQzySmXtJnFgxtzsORactIEJoAteSMXufWZITn4OQ5VQkc4/CfU0HgHioQw86dpdJrfOLkhEx65JxfUsQiOTgpcTOgYYEda6+dY4cnTTpj2fASVDhQMQ/vo+Ab/W14nYG8z0IPwGJ1qDXRO6AtPD7vbah2LdHQTUTiAbbtva7NWuKbFiVxD2tdrVQHKWqmKXw+JF5F8TBDEnUxFtulW2hbM+vwu6mPxbYQZEpONjLKSa7qiZ8jQZ8cy9KatQYkqPLeGEbgI+IsmA4bnJJennToyWZ2N4W071ddtLB7hDH4ZRVdaLVYtfeKW/b/+YGX3N8p5cMKGIDjpGyF0BocFalQ7gYfg2ouAn1RyEPkCaw6ntA2uzIgvTqxU5inWJCFn20Ogst0oIaPs=
-    on:
-      branch: test-travis
-      repo: tensorpack/tensorpack
-      python: "3.6"
-      condition: "$PYPI = true"
+#deploy:
+#  - provider: pypi
+#    user: ppwwyyxx
+#    distributions: "sdist bdist_wheel"
+#    skip_upload_docs: true
+#    password:
+#      secure: lnNfzPFSk4HF7seuu63CoUa0g4V0JPs42H5FTzWecEIro8IqjdaAvzNKuhu9E4rkrMXPzoYYA6gC4YhseT7N/jg7lyV/Nn1koGXK1gmzu5JnXZXHw5/Ri0I3hOF3OFpEvkR39tzV0r5HsO0JIU3pl11+bBS8iloEtzdTPaUaRgyYxyZGrHl8l3VnUyM50PRnPGDouK6DHxJmknnFCOZFCrEpwN4zpOs55c1rChjJ8aql69rSsXQOUP8++jRtGwgqEvHh0K86uGm1AJUYvSWmcoD+5Urcg8LGaQzySmXtJnFgxtzsORactIEJoAteSMXufWZITn4OQ5VQkc4/CfU0HgHioQw86dpdJrfOLkhEx65JxfUsQiOTgpcTOgYYEda6+dY4cnTTpj2fASVDhQMQ/vo+Ab/W14nYG8z0IPwGJ1qDXRO6AtPD7vbah2LdHQTUTiAbbtva7NWuKbFiVxD2tdrVQHKWqmKXw+JF5F8TBDEnUxFtulW2hbM+vwu6mPxbYQZEpONjLKSa7qiZ8jQZ8cy9KatQYkqPLeGEbgI+IsmA4bnJJennToyWZ2N4W071ddtLB7hDH4ZRVdaLVYtfeKW/b/+YGX3N8p5cMKGIDjpGyF0BocFalQ7gYfg2ouAn1RyEPkCaw6ntA2uzIgvTqxU5inWJCFn20Ogst0oIaPs=
+#    on:
+#      tags: true
+#      branch: master
+#      repo: tensorpack/tensorpack
+#      python: "3.6"
+#      condition: "$PYPI = true"
+#  - provider: pypi
+#    server: https://testpypi.python.org/pypi
+#    user: ppwwyyxx
+#    distributions: "sdist bdist_wheel"
+#    skip_upload_docs: true
+#    password:
+#      secure: lnNfzPFSk4HF7seuu63CoUa0g4V0JPs42H5FTzWecEIro8IqjdaAvzNKuhu9E4rkrMXPzoYYA6gC4YhseT7N/jg7lyV/Nn1koGXK1gmzu5JnXZXHw5/Ri0I3hOF3OFpEvkR39tzV0r5HsO0JIU3pl11+bBS8iloEtzdTPaUaRgyYxyZGrHl8l3VnUyM50PRnPGDouK6DHxJmknnFCOZFCrEpwN4zpOs55c1rChjJ8aql69rSsXQOUP8++jRtGwgqEvHh0K86uGm1AJUYvSWmcoD+5Urcg8LGaQzySmXtJnFgxtzsORactIEJoAteSMXufWZITn4OQ5VQkc4/CfU0HgHioQw86dpdJrfOLkhEx65JxfUsQiOTgpcTOgYYEda6+dY4cnTTpj2fASVDhQMQ/vo+Ab/W14nYG8z0IPwGJ1qDXRO6AtPD7vbah2LdHQTUTiAbbtva7NWuKbFiVxD2tdrVQHKWqmKXw+JF5F8TBDEnUxFtulW2hbM+vwu6mPxbYQZEpONjLKSa7qiZ8jQZ8cy9KatQYkqPLeGEbgI+IsmA4bnJJennToyWZ2N4W071ddtLB7hDH4ZRVdaLVYtfeKW/b/+YGX3N8p5cMKGIDjpGyF0BocFalQ7gYfg2ouAn1RyEPkCaw6ntA2uzIgvTqxU5inWJCFn20Ogst0oIaPs=
+#    on:
+#      branch: test-travis
+#      repo: tensorpack/tensorpack
+#      python: "3.6"
+#      condition: "$PYPI = true"
## Understand Trainer

### How Existing (Single-Cost) Trainers Work

Most neural network training tasks are single-cost optimization.
Tensorpack provides some trainer implementations for such tasks.
These trainers will take care of step 1 (define the graph), with the following arguments:

1. Some `tf.TensorSpec`, the signature of the input.
2. An `InputSource`, where the input comes from. See [Input Pipeline](input-source.html).
3. A function which takes input tensors and returns the cost.
4. A function which returns an optimizer.

These are documented in [SingleCostTrainer.setup_graph](/modules/train.html#tensorpack.train.SingleCostTrainer.setup_graph).
In practice you won't use this method directly, but use the [high-level interface](/tutorial/training-interface.html#with-modeldesc-and-trainconfig) instead.
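To make the four ingredients concrete, here is a minimal sketch that calls `setup_graph` directly (TF 1.x style; `get_my_dataflow()` is a hypothetical placeholder for your own DataFlow, and the shapes are arbitrary). Normally the high-level interface fills in these arguments for you.

```python
import tensorflow as tf
from tensorpack import SimpleTrainer, QueueInput

def get_cost_fn(image, label):
    # 3. takes the input tensors, returns the cost to minimize
    logits = tf.layers.dense(tf.layers.flatten(image), 10)
    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label))

def get_opt_fn():
    # 4. returns an optimizer
    return tf.train.GradientDescentOptimizer(1e-2)

trainer = SimpleTrainer()   # a single-cost trainer
trainer.setup_graph(
    # 1. the signature of the input
    [tf.TensorSpec([None, 28, 28], tf.float32, 'image'),
     tf.TensorSpec([None], tf.int64, 'label')],
    # 2. an InputSource; get_my_dataflow() stands in for your own DataFlow
    QueueInput(get_my_dataflow()),
    get_cost_fn, get_opt_fn)
trainer.train_with_defaults(steps_per_epoch=100, max_epoch=10)
```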
### Tower Trainer

[TowerTrainer](../modules/train.html#tensorpack.train.TowerTrainer)
is a trainer that uses a user-provided "tower function" to build models.
All existing trainers in tensorpack are subclasses of ``TowerTrainer``,
because this concept is able to cover most types of neural-network training tasks.

#### What is a Tower Function

Following the terminology in TensorFlow,
a __tower function__ is a callable that takes input tensors and adds __one replicate__ of the model to the graph.
In short, __the tower function builds your model__.
If you can write a function that builds your model, then you can use `TowerTrainer`.

The concept of "tower" is used mainly to support:

1. Data-parallel multi-GPU training, where a replicate is built on each GPU.
2. Graph construction for inference, where a replicate is built under inference mode.

A user needs to provide a tower function to use `TowerTrainer`.
In particular, when working with the commonly used `ModelDesc` interface, the `build_graph`
method will be part of the tower function.
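For example, a tower function can be as small as the sketch below (a hypothetical linear classifier; with `ModelDesc`, the `build_graph` method plays exactly this role):

```python
import tensorflow as tf

def tower_func(image, label):
    # Builds ONE replicate of a (hypothetical) linear classifier from input tensors.
    logits = tf.layers.dense(tf.layers.flatten(image), 10, name='linear')
    cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label))
    return cost
```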
#### Rules of Tower Function

The tower function needs to follow some rules:

1. __It may get called multiple times__ for data-parallel training or inference. As a result:
   * You'll need to be careful when modifying global states, e.g.
     adding ops to collections or setting attributes of a model instance.
   * To use a tensorflow-hub module, you need to initialize the
     module outside the tower function, and call the module inside the tower function.
2. It must __respect variable collections__:
   * (Required) Only put variables __trainable by gradient descent__ into `TRAINABLE_VARIABLES`.
   * (Recommended) Put non-trainable variables that need to be used in inference into `MODEL_VARIABLES`.
3. It must __respect variable scope names__:

   The name of any trainable variable created in the function must look like "variable_scope_name/other/scopes/and/name".
   Strictly speaking, the name of any trainable variable must:

   * Start with the name of the enclosing variable_scope when the tower function is called.
   * Not use the same variable_scope's name twice in its name.
   * Not depend on the name_scope's name.
   * Not depend on any tensor's name (because a tensor's name may depend on the name_scope's name).

   Tensorpack layers create variables based on the name given to the layer:
   e.g., `Conv2D('test', x)` will open a variable scope named "test".
   In order to respect the above rules,
   the name of the layer must not depend on the name_scope's name or any tensor's name.
4. It must __respect variable scope reuse__ (see the sketch below):
   * The creation of any trainable variables must __respect reuse__ variable scope.
     To respect variable reuse (i.e. sharing), use `tf.get_variable` instead of `tf.Variable` in the function.

     On the other hand, for a non-trainable variable, it may be desirable not to reuse it between towers.
     In this case, `tf.Variable` can be used to ensure the creation of new variables in each tower even when `reuse=True`.
   * Do not modify the reuse option (e.g., by `scope.reuse_variables()`) of a variable
     scope that was not created by you. This affects others' code. You can always
     open new scopes if you need the reuse option.
5. It must not create scopes or variables containing the name 'tower', as this name is
   reserved for special use.

These conventions are easy to follow, and most layer wrappers (e.g.,
tf.layers/slim/tensorlayer) do follow them. Note that certain Keras layers do not
follow these conventions and will need some workarounds if used within tensorpack.
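The sketch below (TF 1.x, with arbitrary names and shapes) illustrates rules 2 and 4:

```python
import tensorflow as tf

def tower_func(image):
    with tf.variable_scope('fc'):
        # Rule 4: create trainable variables with tf.get_variable, so that when the
        # trainer calls this function again under reuse=True, the weights are shared.
        w = tf.get_variable('W', shape=[28 * 28, 10])
        b = tf.get_variable('b', shape=[10], initializer=tf.zeros_initializer())
    logits = tf.matmul(tf.reshape(image, [-1, 28 * 28]), w) + b

    # Rules 2 and 4: a non-trainable variable (e.g. a running statistic) that each
    # tower keeps separately. tf.Variable always creates a new variable, even under
    # reuse=True; adding it to MODEL_VARIABLES makes it available to inference code.
    running_mean = tf.Variable(0.0, trainable=False, name='running_mean',
                               collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                                            tf.GraphKeys.MODEL_VARIABLES])
    return logits
```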
#### What You Can Do Inside a Tower Function

1. Call any symbolic functions, as long as they follow the above rules.
2. The tower function will be called under a
   [TowerContext](../modules/tfutils.html#tensorpack.tfutils.tower.BaseTowerContext),
   which can be accessed by [get_current_tower_context()](../modules/tfutils.html#tensorpack.tfutils.tower.get_current_tower_context).
   The context contains information about the training/inference mode, the scope name, etc.
   You can use the context to build a different graph under different modes.
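For example, a tower function may branch on the context roughly like this (a sketch; the architecture and dropout rate are arbitrary):

```python
import tensorflow as tf
from tensorpack.tfutils.tower import get_current_tower_context

def tower_func(image, label):
    ctx = get_current_tower_context()
    net = tf.layers.flatten(image)
    net = tf.layers.dense(net, 128, activation=tf.nn.relu, name='fc0')
    # Build a slightly different graph depending on training vs. inference mode.
    net = tf.layers.dropout(net, rate=0.5, training=ctx.is_training)
    logits = tf.layers.dense(net, 10, name='fc1')
    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label))
```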
### Write a Trainer

@@ -64,7 +95,7 @@ If you just want to do some extra work during training, first consider writing it as a callback,
or write an issue to see if there is a better solution than creating new trainers.
If your task is fundamentally different from single-cost optimization, you will need to write a trainer.
You can customize the trainer by either using or inheriting the `Trainer`/`TowerTrainer` class.

You will need to do two things for a new Trainer:

1. Define the graph. There are 2 ways you can do this:
@@ -82,5 +113,6 @@ You will need to do two things for a new Trainer:
   to be taken to choose which session to use, because many states
   (global steps, StagingArea, summaries) are maintained through `before_run`/`after_run`.

If you want to write a new trainer, the tensorpack examples include several different
[GAN trainers](../../examples/GAN/GAN.py) for reference.
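For a rough picture of the two ingredients (graph definition and the per-step work), a toy trainer might look like the sketch below. It assumes the base `Trainer` runs `self.train_op` in its default `run_step`; see the GAN trainers above for real, maintained examples.

```python
import tensorflow as tf
from tensorpack import Trainer

class ToyTrainer(Trainer):
    """A toy trainer that is not tied to the single-cost interface."""
    def __init__(self):
        super(ToyTrainer, self).__init__()
        # 1. Define the graph (here: minimize (x - 3)^2 by gradient descent).
        x = tf.get_variable('x', shape=[], initializer=tf.constant_initializer(5.0))
        cost = tf.square(x - 3.0)
        # 2. Define the iteration: the default run_step is assumed to run train_op.
        self.train_op = tf.train.GradientDescentOptimizer(0.1).minimize(cost)

if __name__ == '__main__':
    ToyTrainer().train_with_defaults(steps_per_epoch=20, max_epoch=2)
```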
# Trainers

Tensorpack follows the "define-and-run" paradigm.
Therefore training consists of two steps:

1. __Define__: Build the graph for the model.
   Users can call whatever tensorflow functions they need to set up the graph.
   Users may or may not use tensorpack `InputSource`, `ModelDesc` or other utilities to build the graph.
   The goal of this step is to define "what to run" in the later training steps,
   and it can happen __either inside or outside__ a tensorpack trainer.

2. __Run__: Train the model (the [Trainer.train() method](/modules/train.html#tensorpack.train.Trainer.train)):

   1. Setup callbacks/monitors.
   2. Finalize the graph, initialize the session.
   3. Run the training loop.

Tensorpack `Trainers` aim to simplify the above two steps
by exploiting some universal patterns.

### Assumptions of Base Trainer

* Q: What types of training can you do with tensorpack?
* A: Anything that runs in a loop.

In research we do training of various kinds.
Tensorpack trainers avoid making assumptions on what type of training
you want to do. For example, unlike Keras, tensorpack does not wrongly assume that:

1. Your training is batched.
2. Your training is gradient-based optimization.
3. Your data has `X` (inputs) and `y` (outputs).
4. You want to evaluate on zero or one validation dataset.
5. ... and more.

The only assumption is that your training follows this pattern:

```python
for epoch_num in range(starting_epoch, max_epoch):
    for local_step in range(steps_per_epoch):
        run_step()  # do something
```

1. Training is **running some iterations**.
   The tensorpack base trainer implements the logic of __running the iteration__.
   Users or derived trainers should implement __what the iteration is__.

2. The trainer assumes the existence of an __"epoch"__, i.e. that the iterations run in double for-loops.
   `steps_per_epoch` can be any number you set,
   and it only affects the [schedule of callbacks](callback.html).
   In other words, an "epoch" in tensorpack is the __default period to run
   callbacks__ (validation, summary, checkpoint, etc.).
   It has nothing to do with your dataset.
### Built-in Trainers

Tensorpack implements a few built-in trainers for __single-cost gradient-based optimization__,
as this is the most common type of task.
If your training follows this pattern, you only need to __select a trainer__,
and use it with its [training interface](training-interface.html).

The simplest example of such a trainer is
[SimpleTrainer](../modules/train.html#tensorpack.train.SimpleTrainer).
All it does is build your model (which you have to provide) once
(or twice if inference is needed by callbacks) and minimize its cost.
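With that interface, selecting a trainer looks roughly like this (a sketch; `MyModel` and `my_dataflow` are hypothetical stand-ins for your own `ModelDesc` subclass and DataFlow):

```python
from tensorpack import TrainConfig, SimpleTrainer, QueueInput, launch_train_with_config

config = TrainConfig(
    model=MyModel(),               # your ModelDesc: inputs(), build_graph(), optimizer()
    data=QueueInput(my_dataflow),  # an InputSource wrapping your DataFlow
    steps_per_epoch=1000,
    max_epoch=100,
)
launch_train_with_config(config, SimpleTrainer())
```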
### Multi-GPU Trainers

For data-parallel multi-GPU training, different [multi-GPU trainers](../modules/train.html)
implement different distribution strategies.
They take care of device placement, gradient averaging and synchronization
in an efficient way, which is why multi-GPU training in tensorpack
is up to [5x faster than Keras](https://github.com/tensorpack/benchmarks/tree/master/other-wrappers).
It takes only one line of code change to use them, e.g. `trainer=SyncMultiGPUTrainerReplicated(...)`.
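For example, continuing the sketch above, switching to synchronous data-parallel training on two GPUs is just (assuming the trainer takes the number of GPUs as its argument):

```python
from tensorpack import SyncMultiGPUTrainerReplicated, launch_train_with_config

# Same config as before; only the trainer changes.
launch_train_with_config(config, SyncMultiGPUTrainerReplicated(2))
```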
Note some __common confusions__ when using these trainers:

1. In each iteration, instead of taking one input tensor for all GPUs and splitting it,
   all GPUs take tensors from the `InputSource`.
@@ -110,7 +92,7 @@
2. The tower function (your model code) will get called once on each GPU.
   You must follow some [rules of tower function](extend/trainer.html#rules-of-tower-function).
### Distributed Trainers

@@ -121,4 +103,4 @@ documentation of [HorovodTrainer](../modules/train.html#tensorpack.train.HorovodTrainer).
Tensorpack has implemented some other distributed trainers using TF's native API,
but TensorFlow is not actively supporting its distributed training features, and
its native distributed performance isn't very good even today.
Therefore those trainers are not maintained and are __not recommended for use__.
@@ -59,7 +59,7 @@ This is a minimal implementation that simply contains these files:

Training throughput (larger is better) of standard R50-FPN Mask R-CNN, on 8 V100s:

| Implementation | Throughput (img/s) |
| --- | :---: |
| [torchvision](https://pytorch.org/blog/torchvision03/#segmentation-models) | 59 |
| tensorpack | 50 |
| [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/MODEL_ZOO.md#end-to-end-faster-and-mask-r-cnn-baselines) | 35 |
...