Shashank Suhas / seminar-breakout · Commits

Commit c51ce295, authored Apr 22, 2019 by Yuxin Wu

Add HostMemoryTracker

Parent: 31cfcadf

Showing 6 changed files with 40 additions and 12 deletions (+40 -12)
docs/conf.py                     +2  -1
examples/FasterRCNN/train.py     +3  -2
setup.py                         +1  -1
tensorpack/callbacks/misc.py     +3  -3
tensorpack/callbacks/prof.py     +30 -4
tensorpack/callbacks/steps.py    +1  -1
docs/conf.py

@@ -31,7 +31,7 @@ MOCK_MODULES = ['tabulate', 'h5py',
                 'scipy', 'scipy.misc', 'scipy.io',
                 'tornado', 'tornado.concurrent',
                 'horovod', 'horovod.tensorflow',
-                'subprocess32', 'functools32']
+                'subprocess32', 'functools32', 'psutil']
 # it's better to have tensorflow installed (for some docs to show)
 # but it's OK to mock it as well

@@ -385,6 +385,7 @@ _DEPRECATED_NAMES = set([
     'start_test',      # TestDataSpeed
     'ThreadedMapData',
     'TrainingMonitor',
+    'PeakMemoryTracker',

     # deprecated or renamed symbolic code
     'Deconv2D', 'psnr',
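The psutil entry matters because conf.py stubs every name in MOCK_MODULES before autodoc imports tensorpack; without it, the new `import psutil` in tensorpack/callbacks/prof.py would break the docs build on a machine without psutil. Below is a minimal sketch of how such a list is typically consumed in a Sphinx conf.py; the loop is illustrative of the common pattern, not a claim about tensorpack's exact code.

import sys
from unittest import mock

# Illustrative: stub optional dependencies so Sphinx autodoc can import the package
# even where they are not installed (e.g. psutil, now imported by callbacks/prof.py).
MOCK_MODULES = ['scipy', 'scipy.misc', 'scipy.io',
                'tornado', 'tornado.concurrent',
                'horovod', 'horovod.tensorflow',
                'subprocess32', 'functools32', 'psutil']
for mod_name in MOCK_MODULES:
    sys.modules[mod_name] = mock.Mock(name=mod_name)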
examples/FasterRCNN/train.py

@@ -485,9 +485,10 @@ if __name__ == '__main__':
         ScheduledHyperParamSetter(
             'learning_rate', warmup_schedule, interp='linear', step_based=True),
         ScheduledHyperParamSetter('learning_rate', lr_schedule),
-        PeakMemoryTracker(),
+        GPUMemoryTracker(),
+        HostMemoryTracker(),
         EstimatedTimeLeft(median=True),
-        SessionRunTimeout(60000).set_chief_only(True),   # 1 minute timeout
+        SessionRunTimeout(60000),   # 1 minute timeout
     ]
     if cfg.TRAIN.EVAL_PERIOD > 0:
         callbacks.extend([
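For readers unfamiliar with the surrounding callbacks: ScheduledHyperParamSetter consumes a schedule of (step-or-epoch, value) pairs. The sketch below uses made-up values; the real warmup_schedule and lr_schedule are defined elsewhere in train.py and are not reproduced here.

from tensorpack.callbacks import ScheduledHyperParamSetter

# Illustrative values only; the real schedules come from the FasterRCNN config.
base_lr = 0.01
warmup_schedule = [(0, base_lr / 100), (1000, base_lr)]      # warmup over the first 1000 steps
lr_schedule = [(120, base_lr * 0.1), (160, base_lr * 0.01)]  # step decay at epochs 120 and 160

callbacks = [
    # step_based=True with interp='linear' interpolates the warmup per training step
    ScheduledHyperParamSetter('learning_rate', warmup_schedule, interp='linear', step_based=True),
    ScheduledHyperParamSetter('learning_rate', lr_schedule),
]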
setup.py

@@ -47,7 +47,7 @@ setup(
         "six",
         "termcolor>=1.1",
         "tabulate>=0.7.7",
-        "tqdm>4.11.1",
+        "tqdm>4.29.0",
         "msgpack>=0.5.2",
         "msgpack-numpy>=0.4.4.2",
         "pyzmq>=16",
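A quick way to check whether an existing environment already satisfies the raised tqdm floor; this sketch uses setuptools' pkg_resources, but any equivalent check works.

import pkg_resources

# Raises VersionConflict if tqdm is too old, DistributionNotFound if it is missing.
try:
    pkg_resources.require("tqdm>4.29.0")
    print("tqdm satisfies the new requirement")
except (pkg_resources.VersionConflict, pkg_resources.DistributionNotFound) as e:
    print("needs upgrade:", e)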
tensorpack/callbacks/misc.py

@@ -46,7 +46,7 @@ class InjectShell(Callback):
         callbacks=[InjectShell('/path/to/pause-training.tmp'), ...]

-        # the following command will pause the training when the epoch finishes:
+        # the following command will pause the training and start a shell when the epoch finishes:
         $ touch /path/to/pause-training.tmp
     """

@@ -85,11 +85,11 @@ class EstimatedTimeLeft(Callback):
     """
     Estimate the time left until completion of training.
     """
-    def __init__(self, last_k_epochs=5, median=False):
+    def __init__(self, last_k_epochs=5, median=True):
         """
         Args:
             last_k_epochs (int): Use the time spent on last k epochs to estimate total time left.
-            median (bool): Use mean by default. If True, use the median time spent on last k epochs.
+            median (bool): Use the mean or median time spent on last k epochs.
         """
         self._times = deque(maxlen=last_k_epochs)
         self._median = median
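The flipped default changes how the last-k epoch times are aggregated into an estimate. A small sketch of why the median is the more robust choice; the deque mirrors the self._times bookkeeping in the diff, while the ETA formula itself is an illustrative assumption.

from collections import deque
from statistics import mean, median

# Last 5 epoch durations in seconds; one epoch was stalled by I/O.
epoch_times = deque([310.0, 305.0, 900.0, 308.0, 312.0], maxlen=5)
epochs_left = 50

eta_mean = mean(epoch_times) * epochs_left      # pulled up by the 900 s outlier
eta_median = median(epoch_times) * epochs_left  # robust to the outlier
print("ETA (h): mean=%.1f  median=%.1f" % (eta_mean / 3600, eta_median / 3600))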
tensorpack/callbacks/prof.py

@@ -9,6 +9,7 @@ import time
 import tensorflow as tf
 from six.moves import map, queue
 from tensorflow.python.client import timeline
+import psutil

 from ..tfutils.common import gpu_available_in_session
 from ..utils import logger
@@ -17,7 +18,7 @@ from ..utils.gpu import get_num_gpu
 from ..utils.nvml import NVMLContext
 from .base import Callback

-__all__ = ['GPUUtilizationTracker', 'GraphProfiler', 'PeakMemoryTracker']
+__all__ = ['GPUUtilizationTracker', 'GraphProfiler', 'PeakMemoryTracker', 'GPUMemoryTracker', 'HostMemoryTracker']


 class GPUUtilizationTracker(Callback):

@@ -205,11 +206,11 @@ class GraphProfiler(Callback):
         self.trainer.monitors.put_event(evt)


-class PeakMemoryTracker(Callback):
+class GPUMemoryTracker(Callback):
     """
     Track peak memory used on each GPU device every epoch, by :mod:`tf.contrib.memory_stats`.
-    The peak memory comes from the `MaxBytesInUse` op, which might span
-    multiple session.run.
+    The peak memory comes from the ``MaxBytesInUse`` op, which is the peak memory used
+    in recent ``session.run`` calls.
     See https://github.com/tensorflow/tensorflow/pull/13107.
     """

@@ -245,3 +246,28 @@ class PeakMemoryTracker(Callback):
         if results is not None:
             for mem, dev in zip(results, self._devices):
                 self.trainer.monitors.put_scalar('PeakMemory(MB)' + dev, mem / 1e6)
+
+
+PeakMemoryTracker = GPUMemoryTracker
+
+
+class HostMemoryTracker(Callback):
+    """
+    Track free RAM on the host.
+
+    When triggered, it writes the size of free RAM into monitors.
+    """
+    _chief_only = False
+
+    def _setup_graph(self):
+        logger.info("[HostMemoryTracker] Free RAM in setup_graph() is {:.2f} GB.".format(self._free_ram_gb()))
+
+    def _before_train(self):
+        logger.info("[HostMemoryTracker] Free RAM in before_train() is {:.2f} GB.".format(self._free_ram_gb()))
+
+    def _trigger(self):
+        ram_gb = self._free_ram_gb()
+        self.trainer.monitors.put_scalar('HostFreeMemory (GB)', ram_gb)
+
+    def _free_ram_gb(self):
+        return psutil.virtual_memory().available / 1024**3
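Usage stays a drop-in change: both trackers go into an ordinary tensorpack callbacks list, and the module-level alias keeps old code that imports PeakMemoryTracker working. A minimal sketch:

from tensorpack.callbacks import GPUMemoryTracker, HostMemoryTracker

extra_callbacks = [
    GPUMemoryTracker(),   # logs 'PeakMemory(MB)' per device each epoch, via tf.contrib.memory_stats
    HostMemoryTracker(),  # logs 'HostFreeMemory (GB)' each epoch, via psutil
]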
tensorpack/callbacks/steps.py

@@ -140,7 +140,7 @@ class SessionRunTimeout(Callback):
         """
         Args:
             timeout_in_ms (int):
         """
         self._timeout = int(timeout_in_ms)

         opt = tf.RunOptions(timeout_in_ms=timeout_in_ms)
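For context, the RunOptions built in this callback is what actually enforces the timeout once it is attached to a run call. A standalone TF1-style sketch, independent of tensorpack:

import tensorflow as tf

opt = tf.RunOptions(timeout_in_ms=60000)  # 1 minute, as in the FasterRCNN example above
x = tf.constant(1.0)

with tf.Session() as sess:
    try:
        sess.run(x, options=opt)          # raises if the run exceeds the timeout
    except tf.errors.DeadlineExceededError:
        print("session.run timed out")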