Commit fa34d239, authored Jun 14, 2018 by Yuxin Wu
Make all monitors chief_only=False by default, improve warnings (#791)
parent b96cb78c
Showing 4 changed files with 27 additions and 14 deletions (+27, -14)
tensorpack/callbacks/monitor.py   +13  -8
tensorpack/callbacks/prof.py      +12  -5
tensorpack/dataflow/remote.py      +1  -0
tensorpack/train/trainers.py       +1  -1
tensorpack/callbacks/monitor.py

@@ -46,6 +46,9 @@ class TrainingMonitor(Callback):
     .. document private functions
     .. automethod:: _setup_graph
     """
+
+    _chief_only = False
+
     def setup_graph(self, trainer):
         self.trainer = trainer
         self._setup_graph()

@@ -92,7 +95,13 @@ class TrainingMonitor(Callback):
 class NoOpMonitor(TrainingMonitor):
-    pass
+    def __init__(self, name=None):
+        self._name = name
+
+    def __str__(self):
+        if self._name is None:
+            return "NoOpMonitor"
+        return "NoOpMonitor({})".format(self._name)


 class Monitors(Callback):

@@ -221,7 +230,7 @@ class TFEventWriter(TrainingMonitor):
         self._flush_secs = flush_secs
         self._split_files = split_files

-    def __new__(cls, logdir=None, max_queue=10, flush_secs=120):
+    def __new__(cls, logdir=None, max_queue=10, flush_secs=120, **kwargs):
         if logdir is None:
             logdir = logger.get_logger_dir()

@@ -229,7 +238,7 @@ class TFEventWriter(TrainingMonitor):
             return super(TFEventWriter, cls).__new__(cls)
         else:
             logger.warn("logger directory was not set. Ignore TFEventWriter.")
-            return NoOpMonitor()
+            return NoOpMonitor("TFEventWriter")

     def _setup_graph(self):
         self._writer = tf.summary.FileWriter(

@@ -268,7 +277,7 @@ class JSONWriter(TrainingMonitor):
             return super(JSONWriter, cls).__new__(cls)
         else:
             logger.warn("logger directory was not set. Ignore JSONWriter.")
-            return NoOpMonitor()
+            return NoOpMonitor("JSONWriter")

     @staticmethod
     def load_existing_json():

@@ -370,8 +379,6 @@ class ScalarPrinter(TrainingMonitor):
     Print scalar data into terminal.
     """
-    _chief_only = False
-
     def __init__(self, enable_step=False, enable_epoch=True, whitelist=None, blacklist=None):
         """

@@ -439,8 +446,6 @@ class ScalarHistory(TrainingMonitor):
     Only used by monitors internally.
     """
-    _chief_only = False
-
     def _setup_graph(self):
         self._dic = defaultdict(list)
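Note: this diff moves `_chief_only = False` from individual monitors (ScalarPrinter, ScalarHistory) into the TrainingMonitor base class, so monitors now run on non-chief workers by default, and the NoOpMonitor placeholder returned by TFEventWriter/JSONWriter when no logger directory is set now carries the name of the writer it replaced. A minimal sketch of the resulting fallback behaviour, assuming no logger directory has been configured (import path matches this file; the printed string follows the `__str__` added above):

    from tensorpack.callbacks.monitor import TFEventWriter, NoOpMonitor

    # __new__ falls back to a *named* NoOpMonitor when logger.get_logger_dir()
    # is unset, so the warning can say which monitor was skipped.
    m = TFEventWriter()        # warns: "logger directory was not set. Ignore TFEventWriter."
    assert isinstance(m, NoOpMonitor)
    print(m)                   # -> NoOpMonitor(TFEventWriter)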
tensorpack/callbacks/prof.py

@@ -114,7 +114,7 @@ class GPUUtilizationTracker(Callback):
 class GraphProfiler(Callback):
     """
     Enable profiling by installing session hooks,
-    and write metadata or tracing files to ``logger.get_logger_dir()``.
+    and write tracing files / events / metadata to ``logger.get_logger_dir()``.

     The tracing files can be loaded from ``chrome://tracing``.
     The metadata files can be processed by

@@ -122,9 +122,16 @@ class GraphProfiler(Callback):
     <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/README.md>`_.
     The event is viewable from tensorboard.

-    Note that the profiling is enabled for every step.
-    You probably want to schedule it less frequently by
-    :class:`PeriodicRunHooks`.
+    Tips:
+
+    Note that the profiling is by default enabled for every step and is expensive.
+    You probably want to schedule it less frequently, e.g.:
+
+    .. code-block:: none
+
+        EnableCallbackIf(
+            GraphProfiler(dump_tracing=True, dump_event=True),
+            lambda self: self.trainer.global_step > 20 and self.trainer.global_step < 30)
     """

     def __init__(self, dump_metadata=False, dump_tracing=True, dump_event=False):
         """

@@ -138,7 +145,7 @@ class GraphProfiler(Callback):
         self._dump_meta = bool(dump_metadata)
         self._dump_tracing = bool(dump_tracing)
         self._dump_event = bool(dump_event)
-        assert os.path.isdir(self._dir)
+        assert os.path.isdir(self._dir), self._dir

     def _before_run(self, _):
         opt = tf.RunOptions()
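The updated docstring recommends gating GraphProfiler with EnableCallbackIf instead of profiling every step. A sketch of how that might be wired into a training run, assuming EnableCallbackIf is importable from tensorpack.callbacks as in this codebase (the 20-30 step window simply mirrors the docstring example):

    from tensorpack.callbacks import EnableCallbackIf, GraphProfiler

    # Profile only while 20 < global_step < 30; tracing files land in
    # logger.get_logger_dir() and can be opened via chrome://tracing,
    # events are viewable in TensorBoard.
    profiler = EnableCallbackIf(
        GraphProfiler(dump_tracing=True, dump_event=True),
        lambda self: 20 < self.trainer.global_step < 30)

    # ... then include `profiler` in the trainer's callbacks list.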
tensorpack/dataflow/remote.py

@@ -74,6 +74,7 @@ def send_dataflow_zmq(df, addr, hwm=50, format=None, bind=False):
                     avg = "{:.3f}".format(sum(q) / len(q))
                     pbar.set_postfix({'AvgSendLat': avg})
     finally:
         logger.info("Exiting send_dataflow_zmq ...")
+        socket.setsockopt(zmq.LINGER, 0)
         socket.close()
         if not ctx.closed:
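The one-line addition sets ZMQ's LINGER option to 0 right before the socket is closed, so shutting down the sender never blocks on messages the receiver will no longer consume. A standalone sketch of that shutdown pattern with pyzmq (the PUSH socket and address are illustrative; send_dataflow_zmq adds serialization, HWM handling and progress reporting around it):

    import zmq

    ctx = zmq.Context()
    socket = ctx.socket(zmq.PUSH)
    socket.connect("tcp://127.0.0.1:8877")   # placeholder address
    try:
        pass  # send serialized datapoints here
    finally:
        # LINGER=0 discards queued-but-unsent messages so close() and
        # context teardown return immediately instead of hanging.
        socket.setsockopt(zmq.LINGER, 0)
        socket.close()
        if not ctx.closed:
            ctx.destroy(0)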
tensorpack/train/trainers.py

@@ -343,7 +343,7 @@ class HorovodTrainer(SingleCostTrainer):
         self.is_chief = hvd.rank() == 0
         self._local_rank = hvd.local_rank()
         self._average = average
-        logger.info("Horovod local rank={}".format(self._local_rank))
+        logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
         super(HorovodTrainer, self).__init__()

     def allreduce(self, grads):
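Only the log prefix changes here, but it connects to the chief_only change in monitor.py: under HorovodTrainer each process builds its own callbacks, and since monitors now default to `_chief_only = False` they run on every rank rather than only on the chief, while hvd.rank() == 0 remains the chief. A small sketch of that rank bookkeeping, assuming Horovod is installed (hvd.init() is called explicitly only for this standalone example):

    import horovod.tensorflow as hvd

    hvd.init()
    is_chief = hvd.rank() == 0        # global rank 0 acts as chief
    local_rank = hvd.local_rank()     # rank within the local machine
    print("[HorovodTrainer] local rank={}".format(local_rank))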