Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
seminar-breakout
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Shashank Suhas
seminar-breakout
Commits
07e28eea
Commit
07e28eea
authored
Jul 15, 2020
by
Yuxin Wu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update about NVML and environment
parent
62ea40c8
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
47 additions
and
33 deletions
+47
-33
docs/tutorial/philosophy/dataflow.md
docs/tutorial/philosophy/dataflow.md
+2
-1
examples/A3C-Gym/train-atari.py
examples/A3C-Gym/train-atari.py
+1
-1
examples/FasterRCNN/README.md
examples/FasterRCNN/README.md
+1
-1
tensorpack/callbacks/prof.py
tensorpack/callbacks/prof.py
+8
-8
tensorpack/tfutils/common.py
tensorpack/tfutils/common.py
+4
-3
tensorpack/utils/gpu.py
tensorpack/utils/gpu.py
+29
-19
tensorpack/utils/nvml.py
tensorpack/utils/nvml.py
+2
-0
No files found.
docs/tutorial/philosophy/dataflow.md
View file @
07e28eea
...
...
@@ -160,7 +160,8 @@ or when you need to filter your data on the fly.
but inefficient for generic data types or numpy arrays.
Also, its implementation
[
does not always clean up the subprocesses correctly
](
https://github.com/pytorch/pytorch/issues/16608
)
.
PyTorch starts to improve on these bad assumptions (e.g., with
[
IterableDataset
](
https://github.com/pytorch/pytorch/pull/19228
)
).
PyTorch has started to improve on bad assumptions 1-3 (e.g., with IterableDataset).
But the interface still bears the history of these assumptions.
On the other hand, DataFlow:
1.
Is an iterator, which does not necessarily have a length or support indexing. This is more generic.
...
...
examples/A3C-Gym/train-atari.py
View file @
07e28eea
...
...
@@ -173,7 +173,7 @@ class MySimulatorMaster(SimulatorMaster, Callback):
try
:
distrib
,
value
=
outputs
.
result
()
except
CancelledError
:
logger
.
info
(
"Client {} cancelled."
.
format
(
client
.
ident
))
logger
.
info
(
"Client {} cancelled."
.
format
(
client
.
ident
.
decode
(
'utf-8'
)
))
return
assert
np
.
all
(
np
.
isfinite
(
distrib
)),
distrib
action
=
np
.
random
.
choice
(
len
(
distrib
),
p
=
distrib
)
...
...
examples/FasterRCNN/README.md
View file @
07e28eea
...
...
@@ -116,7 +116,7 @@ Performance in [Detectron](https://github.com/facebookresearch/Detectron/) can b
We compare models that have identical training & inference cost between the two implementations.
Their numbers can be different due to small implementation details.
<a
id=
"ft2"
>
2
</a>
:
Our mAP is __7 point__ better
than the official model in
<a
id=
"ft2"
>
2
</a>
:
This model has __7 points__ better mAP
than the official model in
[
matterport/Mask_RCNN
](
https://github.com/matterport/Mask_RCNN/releases/tag/v2.0
)
which has the same architecture.
Our implementation is also
[
5x faster
](
https://github.com/tensorpack/benchmarks/tree/master/MaskRCNN
)
.
...
...
tensorpack/callbacks/prof.py
View file @
07e28eea
...
...
@@ -124,10 +124,10 @@ class GPUUtilizationTracker(Callback):
Args:
devices (list[int])
"""
try
:
with
NVMLContext
()
as
ctx
:
devices
=
[
ctx
.
device
(
i
)
for
i
in
devices
]
while
True
:
try
:
evt
.
wait
()
# start epoch
evt
.
clear
()
if
stop_evt
.
is_set
():
# or on exit
...
...
tensorpack/tfutils/common.py
View file @
07e28eea
...
...
@@ -10,6 +10,7 @@ import psutil
import
tensorflow
as
tf
import
numpy
as
np
import
tensorpack
from
..compat
import
tfv1
from
..utils.argtools
import
graph_memoized
from
..utils.utils
import
find_library_full_path
as
find_library
...
...
@@ -172,10 +173,10 @@ def collect_env_info():
data
=
[]
data
.
append
((
"sys.platform"
,
sys
.
platform
))
data
.
append
((
"Python"
,
sys
.
version
.
replace
(
"
\n
"
,
""
)))
data
.
append
((
"Tensorpack"
,
__git_version__
))
data
.
append
((
"Tensorpack"
,
__git_version__
+
" @"
+
os
.
path
.
dirname
(
tensorpack
.
__file__
)
))
data
.
append
((
"Numpy"
,
np
.
__version__
))
data
.
append
((
"TensorFlow"
,
tfv1
.
VERSION
+
"/"
+
tfv1
.
GIT_VERSION
))
data
.
append
((
"TensorFlow"
,
tfv1
.
VERSION
+
"/"
+
tfv1
.
GIT_VERSION
+
" @"
+
os
.
path
.
dirname
(
tf
.
__file__
)
))
data
.
append
((
"TF Compiler Version"
,
tfv1
.
COMPILER_VERSION
))
has_cuda
=
tf
.
test
.
is_built_with_cuda
()
data
.
append
((
"TF CUDA support"
,
has_cuda
))
...
...
@@ -221,7 +222,7 @@ def collect_env_info():
# Other important dependencies:
try
:
import
horovod
data
.
append
((
"Horovod"
,
horovod
.
__version__
))
data
.
append
((
"Horovod"
,
horovod
.
__version__
+
" @"
+
os
.
path
.
dirname
(
horovod
.
__file__
)
))
except
ImportError
:
pass
...
...
tensorpack/utils/gpu.py
View file @
07e28eea
...
...
@@ -43,19 +43,29 @@ def get_num_gpu():
logger
.
warn
(
message
+
"But TensorFlow was not built with CUDA support and could not use GPUs!"
)
return
ret
try
:
# Use NVML to query device properties
with
NVMLContext
()
as
ctx
:
nvml_num_dev
=
ctx
.
num_devices
()
except
Exception
:
nvml_num_dev
=
None
env
=
os
.
environ
.
get
(
'CUDA_VISIBLE_DEVICES'
,
None
)
if
env
:
return
warn_return
(
len
(
env
.
split
(
','
)),
"Found non-empty CUDA_VISIBLE_DEVICES. "
)
num_dev
=
len
(
env
.
split
(
','
))
assert
num_dev
<=
nvml_num_dev
,
\
"Only {} GPU(s) available, but CUDA_VISIBLE_DEVICES is set to {}"
.
format
(
nvml_num_dev
,
env
)
return
warn_return
(
num_dev
,
"Found non-empty CUDA_VISIBLE_DEVICES. "
)
output
,
code
=
subproc_call
(
"nvidia-smi -L"
,
timeout
=
5
)
if
code
==
0
:
output
=
output
.
decode
(
'utf-8'
)
return
warn_return
(
len
(
output
.
strip
()
.
split
(
'
\n
'
)),
"Found nvidia-smi. "
)
try
:
# Use NVML to query device properties
with
NVMLContext
()
as
ctx
:
return
warn_return
(
ctx
.
num_devices
(),
"NVML found nvidia devices. "
)
except
Exception
:
# Fallback
if
nvml_num_dev
is
not
None
:
return
warn_return
(
nvml_num_dev
,
"NVML found nvidia devices. "
)
# Fallback to TF
logger
.
info
(
"Loading local devices by TensorFlow ..."
)
try
:
...
...
tensorpack/utils/nvml.py
View file @
07e28eea
...
...
@@ -191,6 +191,8 @@ class NVMLContext(object):
Returns:
NvidiaDevice: single GPU device
"""
num_dev
=
self
.
num_devices
()
assert
idx
<
num_dev
,
"Cannot obtain device {}: NVML only found {} devices."
.
format
(
idx
,
num_dev
)
class
GpuDevice
(
Structure
):
pass
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment