Commit acd7f798 in seminar-breakout, authored Jul 23, 2016 by Yuxin Wu
use relative id in TrainConfig.tower
parent f6acf786
Showing 7 changed files with 27 additions and 14 deletions (+27, -14)
README.md                        +1  -1
examples/cifar-convnet.py        +1  -1
examples/svhn-digit-convnet.py   +2  -2
tensorpack/RL/history.py         +1  -0
tensorpack/train/config.py       +13 -2
tensorpack/train/multigpu.py     +7  -6
tensorpack/train/trainer.py      +2  -2
README.md

@@ -12,7 +12,7 @@ See some interesting [examples](https://github.com/ppwwyyxx/tensorpack/tree/mast
 ## Features:
-Focused on modularity. Just have to define the three components to start a training:
+Focus on modularity. You just have to define the following three components to start a training:
 1. The model, or the graph. Define the graph as well as its inputs and outputs.
    `models/` has some scoped abstraction of common models.
examples/cifar-convnet.py

@@ -20,7 +20,7 @@ Cifar10:
 91% accuracy after 80k step.
 19.3 step/s on Tesla M40
-Not a good for Cifar100, just for demonstration.
+Not a good model for Cifar100, just for demonstration.
 """
 class Model(ModelDesc):
examples/svhn-digit-convnet.py

@@ -13,10 +13,10 @@ from tensorpack.tfutils.symbolic_functions import *
 from tensorpack.tfutils.summary import *
 """
-SVHN convnet.
+A very small SVHN convnet model (only 0.8m parameters).
 About 3.0% validation error after 70 epoch. 2.5% after 130 epoch.
-Each epoch is set to 4721 iterations. The speed is about 44 it/s on a Tesla M30
+Each epoch is set to 4721 iterations. The speed is about 44 it/s on a Tesla M40
 """
 class Model(ModelDesc):
tensorpack/RL/history.py

@@ -32,6 +32,7 @@ class HistoryFramePlayer(ProxyPlayer):
         zeros = [np.zeros_like(self.history[0]) for k in range(diff_len)]
         for k in self.history:
             zeros.append(k)
+        assert len(zeros) == self.history.maxlen
         return np.concatenate(zeros, axis=2)

     def action(self, act):
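The assertion added here pins down the invariant of the snippet above: after left-padding with zero frames, the stacked history always holds exactly maxlen frames before being concatenated along the channel axis. A standalone sketch of that padding logic, using a hypothetical stack_history helper that is not part of the commit:

import numpy as np
from collections import deque

def stack_history(history, maxlen):
    # Left-pad a deque of HxWxC frames with zero frames, then stack them
    # along the channel axis, mirroring the logic shown in the diff above.
    diff_len = maxlen - len(history)
    zeros = [np.zeros_like(history[0]) for _ in range(diff_len)]
    for k in history:
        zeros.append(k)
    assert len(zeros) == maxlen          # the invariant the commit asserts
    return np.concatenate(zeros, axis=2)

# Example: a 4-frame history that currently holds only 2 observed frames.
hist = deque([np.ones((84, 84, 1)), 2 * np.ones((84, 84, 1))], maxlen=4)
print(stack_history(hist, hist.maxlen).shape)   # (84, 84, 4)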
tensorpack/train/config.py

@@ -31,7 +31,8 @@ class TrainConfig(object):
         :param starting_epoch: int. default to be 1.
         :param step_per_epoch: the number of steps (SGD updates) to perform in each epoch.
         :param max_epoch: maximum number of epoch to run training. default to inf
-        :param nr_tower: int. number of towers. default to 1.
+        :param nr_tower: int. number of training towers. default to 1.
+        :param tower: list of training towers in relative id. default to `range(nr_tower)` if nr_tower is given.
         :param extra_threads_procs: list of `Startable` threads or processes
         """
         def assert_type(v, tp):

@@ -53,7 +54,17 @@ class TrainConfig(object):
         self.starting_epoch = int(kwargs.pop('starting_epoch', 1))
         self.max_epoch = int(kwargs.pop('max_epoch', 99999))
         assert self.step_per_epoch > 0 and self.max_epoch > 0
-        self.nr_tower = int(kwargs.pop('nr_tower', 1))
+        nr_tower = kwargs.pop('nr_tower', None)
+        tower = kwargs.pop('tower', None)
+        assert nr_tower is None or tower is None, "Cannot set both nr_tower and tower!"
+        if nr_tower:
+            tower = list(range(nr_tower))
+        else:
+            if isinstance(tower, int):
+                tower = list(range(tower))
+        self.tower = tower
         self.extra_threads_procs = kwargs.pop('extra_threads_procs', [])
         assert len(kwargs) == 0, 'Unknown arguments: {}'.format(str(kwargs.keys()))
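With this change, nr_tower and tower become mutually exclusive ways to specify the training towers: nr_tower=k expands to the relative ids range(k), while tower may be either an explicit list of relative gpu ids or a plain int that is expanded the same way. A standalone sketch of just that normalization step (a hypothetical normalize_tower helper, not part of TrainConfig itself):

def normalize_tower(nr_tower=None, tower=None):
    # Mirrors the tower-handling logic added in this commit.
    assert nr_tower is None or tower is None, "Cannot set both nr_tower and tower!"
    if nr_tower:
        tower = list(range(nr_tower))      # nr_tower=3   -> [0, 1, 2]
    elif isinstance(tower, int):
        tower = list(range(tower))         # tower=3      -> [0, 1, 2]
    return tower                           # tower=[0, 2] -> [0, 2] (kept as-is)

print(normalize_tower(nr_tower=2))         # [0, 1]
print(normalize_tower(tower=[0, 2, 3]))    # [0, 2, 3]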
tensorpack/train/multigpu.py

@@ -38,13 +38,14 @@ class MultiGPUTrainer(QueueInputTrainer):
         return ret

     def _multi_tower_grads(self):
-        logger.info("Training a model of {} tower".format(self.config.nr_tower))
+        logger.info("Training a model of {} tower".format(len(self.config.tower)))

         grad_list = []
-        for i in range(self.config.nr_tower):
-            with tf.device('/gpu:{}'.format(i)), \
-                    tf.name_scope('tower{}'.format(i)) as scope:
-                logger.info("Building graph for training tower {}...".format(i))
+        for idx, t in enumerate(self.config.tower):
+            with tf.device('/gpu:{}'.format(t)), \
+                    tf.name_scope('tower{}'.format(idx)) as scope:
+                logger.info("Building graph for training tower {}...".format(idx))
                 model_inputs = self._get_model_inputs()    # each tower dequeue from input queue
                 self.dequed_inputs.append(model_inputs)

@@ -55,7 +56,7 @@ class MultiGPUTrainer(QueueInputTrainer):
                 grad_list.append(
                     self.config.optimizer.compute_gradients(cost_var, gate_gradients=0))
-                if i == 0:
+                if idx == 0:
                     tf.add_to_collection(MOVING_SUMMARY_VARS_KEY, cost_var)
                     tf.get_variable_scope().reuse_variables()
                     # avoid repeated summary from each device
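The loop above is where the relative ids matter: the configured tower entry t picks the physical device, while the enumeration index idx numbers the name scope, so tower=[2, 3] still produces scopes tower0 and tower1. A small sketch of that mapping (a hypothetical tower_placements helper, not from the commit):

def tower_placements(tower):
    # (device, name scope) pairs, as built in _multi_tower_grads above.
    return [('/gpu:{}'.format(t), 'tower{}'.format(idx))
            for idx, t in enumerate(tower)]

print(tower_placements([0, 1]))   # [('/gpu:0', 'tower0'), ('/gpu:1', 'tower1')]
print(tower_placements([2, 3]))   # [('/gpu:2', 'tower0'), ('/gpu:3', 'tower1')]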
tensorpack/train/trainer.py

@@ -138,7 +138,7 @@ class QueueInputTrainer(Trainer):
         :param config: a `TrainConfig` instance
         :param input_queue: a `tf.QueueBase` instance to be used to buffer datapoints.
             Defaults to a FIFO queue of size 100.
-        :param predict_tower: list of gpu idx to run prediction. default to be [0].
+        :param predict_tower: list of gpu relative idx to run prediction. default to be [0].
             Use -1 for cpu.
         """
         super(QueueInputTrainer, self).__init__(config)

@@ -189,7 +189,7 @@ class QueueInputTrainer(Trainer):
         self._extra_threads_procs.append(self.input_th)

     def train(self):
-        assert self.config.nr_tower == 1, \
+        assert len(self.config.tower) == 1, \
             "QueueInputTrainer doesn't support multigpu! Use Sync/AsyncMultiGPUTrainer instead."
         self.init_session_and_coord()
         self._build_enque_thread()
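The updated docstring describes predict_tower entries as relative gpu indices, with -1 selecting the CPU. A sketch of that convention as a device-string mapping (a hypothetical predict_device helper, not part of the commit):

def predict_device(tower_id):
    # -1 means CPU; any other entry is a (relative) GPU index.
    return '/cpu:0' if tower_id == -1 else '/gpu:{}'.format(tower_id)

print([predict_device(t) for t in [0]])      # default: ['/gpu:0']
print([predict_device(t) for t in [-1, 1]])  # ['/cpu:0', '/gpu:1']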