Commit dc68ce0d
Fix chief_only for input callbacks

Authored Dec 16, 2017 by Yuxin Wu
Parent: f2d2501b

Showing 5 changed files, with 15 additions and 13 deletions:

    examples/ResNet/imagenet-resnet.py              +3  -3
    examples/ResNet/imagenet_utils.py               +0  -2
    tensorpack/input_source/input_source.py         +1  -4
    tensorpack/input_source/input_source_base.py    +8  -1
    tensorpack/train/trainers.py                    +3  -3
examples/ResNet/imagenet-resnet.py  (view file @ dc68ce0d)

@@ -30,6 +30,9 @@ class Model(ImageNetModel):
     def __init__(self, depth, data_format='NCHW', mode='resnet'):
         super(Model, self).__init__(data_format)
+        if mode == 'se':
+            assert depth >= 50
+
         self.mode = mode
         basicblock = preresnet_basicblock if mode == 'preact' else resnet_basicblock
         bottleneck = {

@@ -115,9 +118,6 @@ if __name__ == '__main__':
     if args.gpu:
         os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
-    if args.mode == 'se':
-        assert args.depth >= 50
-
     model = Model(args.depth, args.data_format, args.mode)
     if args.eval:
         batch = 128    # something that can run on one gpu
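Taken together, the two hunks move the SE-mode sanity check from the script's __main__ block into Model.__init__, so the invariant holds for every caller that constructs a Model, not just this script. A minimal standalone sketch of the pattern (illustrative names, not tensorpack's actual class):

    # Validate constructor arguments inside __init__ rather than at each
    # call site; any code path that builds the model gets the check.
    class Model(object):
        def __init__(self, depth, data_format='NCHW', mode='resnet'):
            if mode == 'se':
                # Squeeze-and-Excitation blocks are only wired up for the
                # bottleneck architectures, hence depth >= 50.
                assert depth >= 50
            self.depth = depth
            self.data_format = data_format
            self.mode = mode

    Model(50, mode='se')      # OK
    Model(18, mode='se')      # AssertionError at construction time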
examples/ResNet/imagenet_utils.py  (view file @ dc68ce0d)

@@ -142,8 +142,6 @@ class ImageNetModel(ModelDesc):
     image_dtype = tf.uint8

     def __init__(self, data_format='NCHW'):
-        if data_format == 'NCHW':
-            assert tf.test.is_gpu_available()
         self.data_format = data_format

     def _get_inputs(self):
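The deletion removes the GPU probe from ImageNetModel's constructor; previously, asking for the NCHW layout asserted a visible GPU, since TensorFlow implements many NCHW kernels only on GPU. A hedged sketch of the removed guard (TF 1.x API; the commit does not state the reason for dropping it):

    # Sketch of the removed guard, TF 1.x API. tf.test.is_gpu_available()
    # spins up a session to probe devices, which is slow and can grab GPU
    # memory -- a plausible reason to avoid calling it in a constructor.
    import tensorflow as tf

    def check_data_format(data_format):
        if data_format == 'NCHW':
            assert tf.test.is_gpu_available(), \
                "the NCHW layout generally requires GPU kernels in TF"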
tensorpack/input_source/input_source.py  (view file @ dc68ce0d)

@@ -32,9 +32,7 @@ __all__ = ['PlaceholderInput', 'FeedInput', 'FeedfreeInput',
 def _get_reset_callback(df):
-    ret = CallbackFactory(setup_graph=lambda _: df.reset_state())
-    ret.chief_only = False
-    return ret
+    return CallbackFactory(setup_graph=lambda _: df.reset_state())


 class PlaceholderInput(InputSource):

@@ -240,7 +238,6 @@ class QueueInput(FeedfreeInput):
     def _get_callbacks(self):
         from ..callbacks.concurrency import StartProcOrThread
         cb = StartProcOrThread(self.thread)
-        cb.chief_only = False
         return [cb, self._create_ema_callback(), _get_reset_callback(self._inf_ds)]

     def _get_input_tensors(self):
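Both deletions are the same cleanup: setting chief_only = False on each individual input callback is now redundant, because InputSource.get_callbacks() (see the next file) stamps the flag on everything it returns. A minimal mock, not tensorpack's real classes, of the contract the flag encodes:

    # Mock of the chief_only contract: a distributed trainer skips
    # chief-only callbacks on non-chief workers, so anything that feeds
    # data must opt out of the default.
    class Callback(object):
        chief_only = True               # default: run on the chief only

        def setup_graph(self):
            pass

    class ResetDataflow(Callback):
        def setup_graph(self):
            print("resetting dataflow state")

    def run_setup(callbacks, is_chief):
        for cb in callbacks:
            if cb.chief_only and not is_chief:
                continue                # skipped on non-chief workers
            cb.setup_graph()

    cb = ResetDataflow()
    cb.chief_only = False               # input must run on every worker
    run_setup([cb], is_chief=False)     # prints: resetting dataflow state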
tensorpack/input_source/input_source_base.py  (view file @ dc68ce0d)

@@ -115,13 +115,20 @@ class InputSource(object):
         which is done also through the Callback interface.
         This method returns the callbacks and the return value will be memoized.

+        All callbacks will be automatically marked as `chief_only=False`,
+        so they will run on all nodes.
+
         Returns:
             list[Callback]: extra callbacks needed by this InputSource.
         """
         assert self.setup_done()
-        return [CallbackFactory(
+        ret = [CallbackFactory(
             before_train=lambda _: self.reset_state())] + self._get_callbacks()
+        for r in ret:
+            r.chief_only = False    # no input callbacks should be chief-only
+        return ret

     def _get_callbacks(self):
         return []
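This hunk is the heart of the fix: instead of trusting every InputSource subclass (and every trainer) to remember the flag, the public get_callbacks() now collects the reset callback plus whatever _get_callbacks() contributes and marks the whole list chief_only = False. A condensed, self-contained sketch of the new control flow, with the tensorpack classes stubbed and the memoization omitted:

    # Condensed sketch of the new get_callbacks() flow; CallbackFactory
    # here is a stub, not tensorpack's real class.
    class CallbackFactory(object):
        chief_only = True

        def __init__(self, setup_graph=None, before_train=None):
            self.setup_graph = setup_graph
            self.before_train = before_train

    class InputSource(object):
        def get_callbacks(self):
            ret = [CallbackFactory(
                before_train=lambda _: self.reset_state())] + self._get_callbacks()
            for r in ret:
                r.chief_only = False    # no input callback is chief-only
            return ret

        def _get_callbacks(self):       # subclasses append their own
            return []

        def reset_state(self):
            pass

    assert all(not cb.chief_only for cb in InputSource().get_callbacks())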
tensorpack/train/trainers.py  (view file @ dc68ce0d)

@@ -183,7 +183,6 @@ class DistributedTrainerReplicated(SingleCostTrainer):
         logger.info("Distributed training on cluster:\n" + str(server.server_def.cluster))
         super(DistributedTrainerReplicated, self).__init__()

-    def _setup_input(self, inputs_desc, input):
         if self.job_name == 'ps':
             # ps shouldn't setup input either
             logger.info("Running ps {}".format(self.server.server_def.task_index))

@@ -191,6 +190,7 @@ class DistributedTrainerReplicated(SingleCostTrainer):
             self.server.join()  # this function will never return  tensorflow#4713
             raise RuntimeError("This is a bug. Server.join() for ps should never return!")

+    def _setup_input(self, inputs_desc, input):
         with override_to_local_variable():
             get_global_step_var()  # gs should be local
             # input source may create variable (queue size summary)

@@ -205,13 +205,13 @@ class DistributedTrainerReplicated(SingleCostTrainer):
             self._make_get_grad_fn(input, get_cost_fn, get_opt_fn), get_opt_fn)

         callbacks = []
-        # initial local_vars syncing
+        # Initial syncing vars from PS
         cb = RunOp(lambda: initial_sync_op,
                    run_before=True, run_as_trigger=False, verbose=True)
         cb.chief_only = False
         callbacks.append(cb)

-        # model_variables syncing
+        # Sync model_variables to PS, only chief needs to do this
         if model_sync_op:
             cb = RunOp(lambda: model_sync_op,
                        run_before=False, run_as_trigger=True, verbose=True)
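The rewritten comments spell out the asymmetry that makes chief_only matter here: the initial sync pulls variables from the parameter servers, which every worker must do, while the model-variable sync pushes to the parameter servers, which only the chief needs to do. A hedged sketch of that pattern, with RunOp stubbed (the real tensorpack callback wraps a TF op):

    # Stubbed RunOp; the point is the asymmetry between the two callbacks:
    # pull-from-PS runs everywhere, push-to-PS is chief-only.
    class RunOp(object):
        chief_only = True

        def __init__(self, op_fn, run_before, run_as_trigger, verbose=False):
            self.op_fn = op_fn
            self.run_before = run_before          # run once before training
            self.run_as_trigger = run_as_trigger  # run at each trigger

    callbacks = []

    # Initial syncing vars from PS: every worker must start from the same
    # weights, so the callback opts out of chief_only.
    cb = RunOp(lambda: "initial_sync_op", run_before=True, run_as_trigger=False)
    cb.chief_only = False
    callbacks.append(cb)

    # Sync model_variables to PS: one writer suffices, so the chief_only
    # default of True is kept.
    cb = RunOp(lambda: "model_sync_op", run_before=False, run_as_trigger=True)
    callbacks.append(cb)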