Shashank Suhas / seminar-breakout · Commits

Commit efc74f2d ("refactor trainer")
authored Nov 09, 2016 by Yuxin Wu
parent d3167ba3

Showing 4 changed files, with 196 additions and 172 deletions:

    tensorpack/train/base.py        +3    -0
    tensorpack/train/multigpu.py    +31   -40
    tensorpack/train/queue.py       +144  -0    (new file)
    tensorpack/train/trainer.py     +18   -132
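Taken together, the four diffs pull the queue-feeding machinery out of trainer.py into the new queue.py and split the old monolithic QueueInputTrainer into smaller base classes. The resulting class layout, reconstructed from the diffs below (a paraphrased sketch, not part of the commit itself):

    Trainer (base.py)
      SimpleTrainer(Trainer)                                    trainer.py
      FeedlessTrainer(Trainer)                                  trainer.py
      MultiPredictorTowerTrainer(Trainer)                       trainer.py
      QueueInputTrainerBase(FeedlessTrainer)                    queue.py
      QueueInputTrainer(MultiPredictorTowerTrainer,
                        QueueInputTrainerBase)                  queue.py
      MultiGPUTrainer(FeedlessTrainer)                          multigpu.py
      SyncMultiGPUTrainer(QueueInputTrainer, MultiGPUTrainer)   multigpu.py
      AsyncMultiGPUTrainer(QueueInputTrainer, MultiGPUTrainer)  multigpu.py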
tensorpack/train/base.py

...
@@ -30,8 +30,11 @@ class Trainer(object):
     Available Attributes:
         stat_holder: a `StatHolder` instance
         summary_writer: a `tf.SummaryWriter`
+        summary_op: a `tf.Operation` which returns a summary string
         config: a `TrainConfig`
         model: a `ModelDesc`
+        sess: a `tf.Session`
+        coord: a `tf.train.Coordinator`
     """
     __metaclass__ = ABCMeta
...
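The newly documented `coord` attribute is a `tf.train.Coordinator`: the EnqueueThread added in queue.py below polls it to decide when to exit, and any thread can use it to request a global stop. A minimal standalone sketch of that protocol (not from this commit; TF 1.x API):

    # Sketch: cooperative shutdown via tf.train.Coordinator, as EnqueueThread uses it.
    import threading
    import tensorflow as tf

    coord = tf.train.Coordinator()

    def worker(n_steps=1000):
        step = 0
        while not coord.should_stop():   # cooperative cancellation point
            step += 1
            if step >= n_steps:
                coord.request_stop()     # signals every other worker too

    threads = [threading.Thread(target=worker) for _ in range(2)]
    for t in threads:
        t.start()
    coord.join(threads)                  # block until all workers have exited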
tensorpack/train/multigpu.py

...
@@ -15,16 +15,42 @@
 from ..tfutils import (backup_collection, restore_collection,
         get_global_step_var, TowerContext)
 from ..tfutils.gradproc import apply_grad_processors, ScaleGradient
-from .trainer import QueueInputTrainer
+from .trainer import FeedlessTrainer
+from .queue import QueueInputTrainer

 __all__ = ['AsyncMultiGPUTrainer', 'SyncMultiGPUTrainer']

-class MultiGPUTrainer(QueueInputTrainer):
+class MultiGPUTrainer(FeedlessTrainer):
     """ Base class for multi-gpu training"""

+    def _multi_tower_grads(self):
+        logger.info("Training a model of {} tower".format(len(self.config.tower)))
+        grad_list = []
+        global_scope = tf.get_variable_scope()
+        for idx, t in enumerate(self.config.tower):
+            with tf.device('/gpu:{}'.format(t)), \
+                    tf.variable_scope(global_scope, reuse=idx > 0), \
+                    TowerContext('tower{}'.format(idx)) as scope:
+                logger.info("Building graph for training tower {}...".format(idx))
+                model_inputs = self._get_input_tensors_noreuse()
+                self.model.build_graph(model_inputs)
+                cost_var = self.model.get_cost()    # build tower
+
+                # TODO gate_gradients=0 might be faster?
+                grad_list.append(
+                    self.config.optimizer.compute_gradients(cost_var, gate_gradients=0))
+
+                if idx == 0:
+                    add_moving_summary(cost_var)
+                    # avoid repeated summary from each device
+                    backup = backup_collection(SUMMARY_BACKUP_KEYS)
+        restore_collection(backup)
+        return grad_list

+class SyncMultiGPUTrainer(QueueInputTrainer, MultiGPUTrainer):
     def __init__(self, config, input_queue=None, predict_tower=None):
         super(MultiGPUTrainer, self).__init__(config, input_queue, predict_tower)
         assert len(config.tower) >= 1, "MultiGPUTrainer must be used with at least one GPU."
         self.dequed_inputs = []

     @staticmethod
     def _average_grads(tower_grads):
...
@@ -48,53 +74,18 @@ class MultiGPUTrainer(QueueInputTrainer):
                 ret.append((grad, v))
         return ret

-    def _multi_tower_grads(self):
-        logger.info("Training a model of {} tower".format(len(self.config.tower)))
-        grad_list = []
-        global_scope = tf.get_variable_scope()
-        for idx, t in enumerate(self.config.tower):
-            with tf.device('/gpu:{}'.format(t)), \
-                    tf.variable_scope(global_scope, reuse=idx > 0), \
-                    TowerContext('tower{}'.format(idx)) as scope:
-                logger.info("Building graph for training tower {}...".format(idx))
-                model_inputs = self._get_dequeued_inputs()   # each tower dequeue from input queue
-                self.dequed_inputs.append(model_inputs)
-                self.model.build_graph(model_inputs)
-                cost_var = self.model.get_cost()    # build tower
-
-                # TODO gate_gradients=0 might be faster?
-                grad_list.append(
-                    self.config.optimizer.compute_gradients(cost_var, gate_gradients=0))
-
-                if idx == 0:
-                    add_moving_summary(cost_var)
-                    # avoid repeated summary from each device
-                    backup = backup_collection(SUMMARY_BACKUP_KEYS)
-        restore_collection(backup)
-        return grad_list
-
-class SyncMultiGPUTrainer(MultiGPUTrainer):
     def _setup(self):
-        self._build_enque_thread()
         grad_list = self._multi_tower_grads()

-        grads = MultiGPUTrainer._average_grads(grad_list)
+        grads = SyncMultiGPUTrainer._average_grads(grad_list)
         grads = apply_grad_processors(grads,
                 self.model.get_gradient_processor())

         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grads, get_global_step_var()),
             summary_moving_average(), name='train_op')
         # [debug]: do nothing in training
         #self.train_op = self.dequed_inputs[0][0] + self.dequed_inputs[1][0]

-class AsyncMultiGPUTrainer(MultiGPUTrainer):
+class AsyncMultiGPUTrainer(QueueInputTrainer, MultiGPUTrainer):
     def _setup(self):
-        self._build_enque_thread()
         grad_list = self._multi_tower_grads()
         gradprocs = self.model.get_gradient_processor()
         # pretend to average the grads, in order to make async and
...
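The body of `_average_grads` is collapsed ("...") in this view; only its tail (`ret.append((grad, v))` / `return ret`) is visible. For orientation, here is a standalone sketch of the per-variable averaging such a helper conventionally performs over `grad_list` (illustrative body, not the commit's actual implementation):

    # Sketch: average gradients across towers, one (grad, var) pair at a time.
    # `tower_grads` is a list with one entry per GPU, each entry being the
    # [(grad, var), ...] list returned by compute_gradients in _multi_tower_grads.
    import tensorflow as tf

    def average_grads_sketch(tower_grads):
        ret = []
        # zip(*tower_grads) groups the k-th (grad, var) pair from every tower
        for grad_and_vars in zip(*tower_grads):
            v = grad_and_vars[0][1]            # the variable is shared across towers
            grads = [g for g, _ in grad_and_vars]
            grad = tf.add_n(grads) / float(len(grads))
            ret.append((grad, v))
        return ret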
tensorpack/train/queue.py  (new file, mode 100644)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: queue.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>

import threading
import tensorflow as tf

from ..dataflow.common import RepeatedData
from ..tfutils.summary import summary_moving_average, add_moving_summary
from ..tfutils import get_global_step_var, TowerContext
from ..utils import logger
from ..callbacks.concurrency import StartProcOrThread
from ..tfutils.gradproc import apply_grad_processors
from .trainer import FeedlessTrainer, MultiPredictorTowerTrainer

__all__ = ['QueueInputTrainerBase', 'QueueInputTrainer']

class EnqueueThread(threading.Thread):
    def __init__(self, trainer):
        super(EnqueueThread, self).__init__()
        self.name = 'EnqueueThread'
        self.daemon = True

        self.sess = trainer.sess
        self.coord = trainer.coord
        self.dataflow = RepeatedData(trainer.config.dataset, -1)

        self.input_vars = trainer.input_vars
        self.queue = trainer.input_queue
        self.op = self.queue.enqueue(self.input_vars)
        self.close_op = self.queue.close(cancel_pending_enqueues=True)

        self.size_op = self.queue.size()
        add_moving_summary(tf.cast(
            self.size_op, tf.float32, name='input_queue_size'))

    def run(self):
        self.dataflow.reset_state()
        with self.sess.as_default():
            try:
                while True:
                    for dp in self.dataflow.get_data():
                        if self.coord.should_stop():
                            return
                        feed = dict(zip(self.input_vars, dp))
                        #print 'TFQ:', self.sess.run([self.op, self.size_op], feed_dict=feed)[1]
                        self.op.run(feed_dict=feed)
            except tf.errors.CancelledError as e:
                pass
            except Exception:
                logger.exception("Exception in EnqueueThread:")
            finally:
                try:
                    self.sess.run(self.close_op)
                except RuntimeError:    # session already closed
                    pass
                self.coord.request_stop()
                logger.info("Enqueue Thread Exited.")

class QueueInputTrainerBase(FeedlessTrainer):
    def _build_enque_thread(self, input_queue):
        """ create a thread that keeps filling the queue """
        self.input_vars = self.model.get_input_vars()
        if input_queue is None:
            self.input_queue = tf.FIFOQueue(
                    50, [x.dtype for x in self.input_vars], name='input_queue')
        else:
            self.input_queue = input_queue
        input_th = EnqueueThread(self)
        self.config.callbacks.append(StartProcOrThread(input_th))

    def _get_input_tensors_noreuse(self):
        """ Dequeue a datapoint from input_queue and return.
            Can be called multiple times.
        """
        ret = self.input_queue.dequeue(name='input_deque')
        if isinstance(ret, tf.Tensor):  # only one input
            ret = [ret]
        assert len(ret) == len(self.input_vars)
        for qv, v in zip(ret, self.input_vars):
            qv.set_shape(v.get_shape())

        # test the overhead of queue
        #with tf.device('/gpu:0'):
            #ret = [tf.Variable(tf.random_normal([128,224,224,3],
                #dtype=tf.float32), trainable=False),
                #tf.Variable(tf.ones([128], dtype=tf.int32), trainable=False)]
        return ret

class QueueInputTrainer(MultiPredictorTowerTrainer, QueueInputTrainerBase):
    """ Single GPU Trainer, takes input from a queue"""

    def __init__(self, config, input_queue=None, predict_tower=None):
        """
        :param config: a `TrainConfig` instance
        :param input_queue: a `tf.QueueBase` instance to be used to buffer datapoints.
            Defaults to a FIFO queue of size 50.
        :param predict_tower: list of gpu relative idx to run prediction. default to be [0].
            Use -1 for cpu.
        """
        super(QueueInputTrainer, self).__init__(config)
        self._setup_predictor_factory(predict_tower)
        self._build_enque_thread(input_queue)

    def _single_tower_grad(self, actual_inputs):
        """ Get grad and cost for single-tower"""
        with TowerContext(''):
            self.model.build_graph(actual_inputs)
            cost_var = self.model.get_cost()
        grads = self.config.optimizer.compute_gradients(
                cost_var, gate_gradients=0)  # GATE_NONE
        add_moving_summary(cost_var)
        return grads

    def _setup(self):
        assert len(self.config.tower) == 1, \
                "QueueInputTrainer doesn't support multigpu! Use Sync/AsyncMultiGPUTrainer instead."
        actual_inputs = self._get_input_tensors_noreuse()
        grads = self._single_tower_grad(actual_inputs)
        grads = apply_grad_processors(grads,
                self.model.get_gradient_processor())

        self.train_op = tf.group(
            self.config.optimizer.apply_gradients(grads, get_global_step_var()),
            summary_moving_average(), name='train_op')
        # skip training
        #self.train_op = tf.group(*self.dequed_inputs)

    def run_step(self):
        """ Simply run self.train_op"""
        self.sess.run(self.train_op)
        # debug-benchmark code:
        #run_metadata = tf.RunMetadata()
        #self.sess.run([self.train_op],
                #options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                #run_metadata=run_metadata
                #)
        #from tensorflow.python.client import timeline
        #trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        #trace_file = open('timeline.ctf.json', 'w')
        #trace_file.write(trace.generate_chrome_trace_format())
        #import sys; sys.exit()
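The new queue.py is built around a classic producer/consumer pattern: EnqueueThread feeds datapoints into a tf.FIFOQueue under a tf.train.Coordinator, and _get_input_tensors_noreuse dequeues them, so the training step itself needs no feed_dict. A self-contained toy version of the same pattern (TF 1.x graph mode; all names here are illustrative, not tensorpack's):

    # Toy reproduction of the EnqueueThread pattern: a producer thread feeds a
    # placeholder into a FIFOQueue while the main thread consumes dequeued tensors.
    import threading
    import numpy as np
    import tensorflow as tf

    x = tf.placeholder(tf.float32, shape=[None, 4])
    queue = tf.FIFOQueue(50, [tf.float32], name='toy_input_queue')
    enqueue_op = queue.enqueue([x])
    close_op = queue.close(cancel_pending_enqueues=True)
    batch = queue.dequeue()                  # training graph reads from here

    coord = tf.train.Coordinator()
    sess = tf.Session()

    def feed_loop():
        try:
            while not coord.should_stop():
                sess.run(enqueue_op, feed_dict={x: np.random.rand(8, 4)})
        except tf.errors.CancelledError:
            pass                             # queue closed, exit quietly

    th = threading.Thread(target=feed_loop)
    th.daemon = True
    th.start()

    for _ in range(100):
        sess.run(batch)                      # no feed_dict on the training path
    coord.request_stop()
    sess.run(close_op)                       # unblocks a producer stuck in enqueue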
tensorpack/train/trainer.py

...
@@ -3,7 +3,6 @@
 # Author: Yuxin Wu <ppwwyyxx@gmail.com>

 import tensorflow as tf
-import threading
 import time
 from six.moves import zip
...
@@ -16,10 +15,9 @@
         get_global_step_var, TowerContext)
 from ..tfutils.summary import summary_moving_average, add_moving_summary
 from ..predict import OnlinePredictor, build_multi_tower_prediction_graph
-from ..callbacks.concurrency import StartProcOrThread
 from ..tfutils.gradproc import apply_grad_processors

-__all__ = ['SimpleTrainer', 'QueueInputTrainer']
+__all__ = ['SimpleTrainer', 'FeedlessTrainer', 'MultiPredictorTowerTrainer']

 class PredictorFactory(object):
     """ Make predictors for a trainer"""
...
@@ -55,6 +53,7 @@ class PredictorFactory(object):
         self.tower_built = True

 class SimpleTrainer(Trainer):
     """ A naive demo trainer """
     def __init__(self, config):
         super(SimpleTrainer, self).__init__(config)
+        self._predictor_factory = PredictorFactory(self.sess, self.model, [0])
...
@@ -94,134 +93,26 @@ class SimpleTrainer(Trainer):
+    def get_predict_func(self, input_names, output_names):
+        return self._predictor_factory.get_predictor(input_names, output_names, 0)

-class EnqueueThread(threading.Thread):
-    ... (identical to the EnqueueThread added to queue.py above; removed from this file) ...
-
-class QueueInputTrainer(Trainer):
-    """ Single GPU Trainer, takes input from a queue"""
-
-    def __init__(self, config, input_queue=None, predict_tower=None):
-        """
-        :param config: a `TrainConfig` instance
-        :param input_queue: a `tf.QueueBase` instance to be used to buffer datapoints.
-            Defaults to a FIFO queue of size 50.
-        :param predict_tower: list of gpu relative idx to run prediction. default to be [0].
-            Use -1 for cpu.
-        """
-        super(QueueInputTrainer, self).__init__(config)
-        self.input_vars = self.model.get_input_vars()
-        if input_queue is None:
-            self.input_queue = tf.FIFOQueue(
-                    50, [x.dtype for x in self.input_vars], name='input_queue')
-        else:
-            self.input_queue = input_queue

+class MultiPredictorTowerTrainer(Trainer):
+    """ A trainer with possibly multiple prediction tower """
+
+    def _setup_predictor_factory(self, predict_tower):
+        # by default, use the first training gpu for prediction
+        predict_tower = predict_tower or [0]
+        self._predictor_factory = PredictorFactory(
+                self.sess, self.model, predict_tower)

-        self.dequed_inputs = None
-
-    def _get_dequeued_inputs(self):
-        """ Dequeue a datapoint from input_queue and return"""
-        ret = self.input_queue.dequeue(name='input_deque')
-        if isinstance(ret, tf.Tensor):  # only one input
-            ret = [ret]
-        assert len(ret) == len(self.input_vars)
-        for qv, v in zip(ret, self.input_vars):
-            qv.set_shape(v.get_shape())
-        return ret
-
-    def _single_tower_grad(self):
-        """ Get grad and cost for single-tower"""
-        self.dequed_inputs = model_inputs = self._get_dequeued_inputs()
-        # test the overhead of queue
-        #with tf.device('/gpu:0'):
-            #self.dequed_inputs = [tf.Variable(tf.random_normal([128,224,224,3],
-                #dtype=tf.float32), trainable=False),
-                #tf.Variable(tf.ones([128], dtype=tf.int32), trainable=False)]
-        with TowerContext(''):
-            self.model.build_graph(self.dequed_inputs)
-            cost_var = self.model.get_cost()
-        grads = self.config.optimizer.compute_gradients(
-                cost_var, gate_gradients=0)  # GATE_NONE
-        add_moving_summary(cost_var)
-        return grads
-
-    def _build_enque_thread(self):
-        """ create a thread that keeps filling the queue """
-        self.input_th = EnqueueThread(self)
-        self.config.callbacks.append(StartProcOrThread(self.input_th))
-
-    def _setup(self):
-        assert len(self.config.tower) == 1, \
-                "QueueInputTrainer doesn't support multigpu! Use Sync/AsyncMultiGPUTrainer instead."
-        self._build_enque_thread()
-        grads = self._single_tower_grad()
-        grads = apply_grad_processors(grads,
-                self.model.get_gradient_processor())
-
-        self.train_op = tf.group(
-            self.config.optimizer.apply_gradients(grads, get_global_step_var()),
-            summary_moving_average(), name='train_op')
-        # skip training
-        #self.train_op = tf.group(*self.dequed_inputs)

+    def get_predict_func(self, input_names, output_names, tower=0):
+        """
+        :param tower: return the kth predict_func
+        :returns: an `OnlinePredictor`
+        """
+        return self._predictor_factory.get_predictor(input_names, output_names, tower)

-    def run_step(self):
-        """ Simply run self.train_op"""
-        self.sess.run(self.train_op)
-        ... (same commented-out timeline/benchmark block as in queue.py above) ...

+    def get_predict_funcs(self, input_names, output_names, n):
+        return [self.get_predict_func(input_names, output_names, k) for k in range(n)]

+class FeedlessTrainer(Trainer):
+    """ A trainer which runs iteration without feed_dict (therefore faster) """
+
+    def _trigger_epoch(self):
+        # need to run summary_op every epoch
+        # note that summary_op will take a data from the queue
...
@@ -229,12 +120,7 @@ class QueueInputTrainer(Trainer):
         summary_str = self.summary_op.eval()
         self._process_summary(summary_str)

-    def get_predict_func(self, input_names, output_names, tower=0):
-        """
-        :param tower: return the kth predict_func
-        :returns: an `OnlinePredictor`
-        """
-        return self._predictor_factory.get_predictor(input_names, output_names, tower)
-
-    def get_predict_funcs(self, input_names, output_names, n):
-        return [self.get_predict_func(input_names, output_names, k) for k in range(n)]
+    def _get_input_tensors_noreuse(self):
+        """ return a list of actual input tensors.
+            Always return new tensors (for multi-tower) if called multiple times.
+        """
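Both copies of run_step keep a commented-out benchmarking block. Expanded into runnable form, it is the standard TF 1.x timeline-tracing recipe; the resulting JSON opens in chrome://tracing. This is a sketch: `train_op` below is a stand-in for self.train_op.

    # Sketch of the timeline-profiling recipe that run_step keeps commented out.
    import tensorflow as tf
    from tensorflow.python.client import timeline

    train_op = tf.no_op(name='train_op')   # placeholder op for the sketch

    with tf.Session() as sess:
        run_metadata = tf.RunMetadata()
        sess.run(train_op,
                 options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                 run_metadata=run_metadata)
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        with open('timeline.ctf.json', 'w') as f:
            f.write(trace.generate_chrome_trace_format())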