Commit 91ea782a authored by Yuxin Wu

Small changes

parent 8dd254be
...@@ -89,7 +89,7 @@ Therefore, not every version of TF ≥ 1.6 supports every feature in this implem ...@@ -89,7 +89,7 @@ Therefore, not every version of TF ≥ 1.6 supports every feature in this implem
1. TF < 1.10: `SyncBN` with NCCL will fail ([PR](https://github.com/tensorflow/tensorflow/pull/20360)). 1. TF < 1.10: `SyncBN` with NCCL will fail ([PR](https://github.com/tensorflow/tensorflow/pull/20360)).
1. TF 1.11 & 1.12: multithread inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/22750)). 1. TF 1.11 & 1.12: multithread inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/22750)).
Latest tensorpack will apply a workaround. Latest tensorpack will apply a workaround.
1. TF > 1.12: MKL inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/24650)). 1. TF 1.13: MKL inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/24650)).
1. TF > 1.12: Horovod training will fail ([issue](https://github.com/tensorflow/tensorflow/issues/25946)). 1. TF > 1.12: Horovod training will fail ([issue](https://github.com/tensorflow/tensorflow/issues/25946)).
Latest tensorpack will apply a workaround. Latest tensorpack will apply a workaround.
......
...@@ -90,6 +90,7 @@ _C.DATA.NUM_CATEGORY = 0 # without the background class (e.g., 80 for COCO) ...@@ -90,6 +90,7 @@ _C.DATA.NUM_CATEGORY = 0 # without the background class (e.g., 80 for COCO)
_C.DATA.CLASS_NAMES = [] # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG". _C.DATA.CLASS_NAMES = [] # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG".
# whether the coordinates in the annotations are absolute pixel values, or a relative value in [0, 1] # whether the coordinates in the annotations are absolute pixel values, or a relative value in [0, 1]
_C.DATA.ABSOLUTE_COORD = True _C.DATA.ABSOLUTE_COORD = True
_C.DATA.NUM_WORKERS = 5 # number of data loading workers
# basemodel ---------------------- # basemodel ----------------------
_C.BACKBONE.WEIGHTS = '' # /path/to/weights.npz _C.BACKBONE.WEIGHTS = '' # /path/to/weights.npz
......
...@@ -369,10 +369,10 @@ def get_train_dataflow(): ...@@ -369,10 +369,10 @@ def get_train_dataflow():
return ret return ret
if cfg.TRAINER == 'horovod': if cfg.TRAINER == 'horovod':
ds = MultiThreadMapData(ds, 5, preprocess) ds = MultiThreadMapData(ds, cfg.DATA.NUM_WORKERS, preprocess)
# MPI does not like fork() # MPI does not like fork()
else: else:
ds = MultiProcessMapDataZMQ(ds, 10, preprocess) ds = MultiProcessMapDataZMQ(ds, cfg.DATA.NUM_WORKERS, preprocess)
return ds return ds
......
...@@ -251,7 +251,7 @@ class EvalCallback(Callback): ...@@ -251,7 +251,7 @@ class EvalCallback(Callback):
scores = DetectionDataset().eval_or_save_inference_results( scores = DetectionDataset().eval_or_save_inference_results(
all_results, self._eval_dataset, output_file) all_results, self._eval_dataset, output_file)
for k, v in scores.items(): for k, v in scores.items():
self.trainer.monitors.put_scalar(k, v) self.trainer.monitors.put_scalar(self._eval_dataset + '-' + k, v)
def _trigger_epoch(self): def _trigger_epoch(self):
if self.epoch_num in self.epochs_to_eval: if self.epoch_num in self.epochs_to_eval:
......
...@@ -7,7 +7,7 @@ import numpy as np ...@@ -7,7 +7,7 @@ import numpy as np
import os import os
import time import time
import tensorflow as tf import tensorflow as tf
from six.moves import map from six.moves import map, queue
from tensorflow.python.client import timeline from tensorflow.python.client import timeline
from ..tfutils.common import gpu_available_in_session from ..tfutils.common import gpu_available_in_session
...@@ -23,7 +23,7 @@ __all__ = ['GPUUtilizationTracker', 'GraphProfiler', 'PeakMemoryTracker'] ...@@ -23,7 +23,7 @@ __all__ = ['GPUUtilizationTracker', 'GraphProfiler', 'PeakMemoryTracker']
class GPUUtilizationTracker(Callback): class GPUUtilizationTracker(Callback):
""" Summarize the average GPU utilization within an epoch. """ Summarize the average GPU utilization within an epoch.
It will start a process to run ``nvidia-smi`` every second It will start a process to obtain GPU utilization through NVML every second
within the epoch (the trigger_epoch time was not included), within the epoch (the trigger_epoch time was not included),
and write average utilization to monitors. and write average utilization to monitors.
...@@ -74,7 +74,14 @@ class GPUUtilizationTracker(Callback): ...@@ -74,7 +74,14 @@ class GPUUtilizationTracker(Callback):
def _trigger_epoch(self): def _trigger_epoch(self):
# Don't do this in after_epoch because # Don't do this in after_epoch because
# before,after_epoch are supposed to be extremely fast by design. # before,after_epoch are supposed to be extremely fast by design.
stats = self._queue.get() try:
stats = self._queue.get(timeout=60)
except queue.Empty:
if self._proc.is_alive():
raise RuntimeError("GPUUtilization.worker() is stuck. This is a bug.")
else:
raise RuntimeError("GPUUtilization.worker() process is killed unexpectedly.")
if stats == -1: if stats == -1:
from ..train.base import StopTraining from ..train.base import StopTraining
raise StopTraining("GPUUtilizationTracker.worker has failed.") raise StopTraining("GPUUtilizationTracker.worker has failed.")
......
...@@ -7,7 +7,7 @@ try: ...@@ -7,7 +7,7 @@ try:
# issue#1924 may happen on old systems # issue#1924 may happen on old systems
import cv2 # noqa import cv2 # noqa
# cv2.setNumThreads(0) # cv2.setNumThreads(0)
if int(cv2.__version__.split('.')[0]) == 3: if int(cv2.__version__.split('.')[0]) >= 3:
cv2.ocl.setUseOpenCL(False) cv2.ocl.setUseOpenCL(False)
# check if cv is built with cuda or openmp # check if cv is built with cuda or openmp
info = cv2.getBuildInformation().split('\n') info = cv2.getBuildInformation().split('\n')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment