Commit 91ea782a authored by Yuxin Wu

Small changes

parent 8dd254be
......@@ -89,7 +89,7 @@ Therefore, not every version of TF ≥ 1.6 supports every feature in this implem
1. TF < 1.10: `SyncBN` with NCCL will fail ([PR](https://github.com/tensorflow/tensorflow/pull/20360)).
1. TF 1.11 & 1.12: multithread inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/22750)).
Latest tensorpack will apply a workaround.
1. TF > 1.12: MKL inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/24650)).
1. TF 1.13: MKL inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/24650)).
1. TF > 1.12: Horovod training will fail ([issue](https://github.com/tensorflow/tensorflow/issues/25946)).
Latest tensorpack will apply a workaround.
......
......@@ -90,6 +90,7 @@ _C.DATA.NUM_CATEGORY = 0 # without the background class (e.g., 80 for COCO)
_C.DATA.CLASS_NAMES = [] # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG".
# whether the coordinates in the annotations are absolute pixel values, or a relative value in [0, 1]
_C.DATA.ABSOLUTE_COORD = True
_C.DATA.NUM_WORKERS = 5 # number of data loading workers
# basemodel ----------------------
_C.BACKBONE.WEIGHTS = '' # /path/to/weights.npz
......
......@@ -369,10 +369,10 @@ def get_train_dataflow():
return ret
if cfg.TRAINER == 'horovod':
ds = MultiThreadMapData(ds, 5, preprocess)
ds = MultiThreadMapData(ds, cfg.DATA.NUM_WORKERS, preprocess)
# MPI does not like fork()
else:
ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
ds = MultiProcessMapDataZMQ(ds, cfg.DATA.NUM_WORKERS, preprocess)
return ds
......
......@@ -251,7 +251,7 @@ class EvalCallback(Callback):
scores = DetectionDataset().eval_or_save_inference_results(
all_results, self._eval_dataset, output_file)
for k, v in scores.items():
self.trainer.monitors.put_scalar(k, v)
self.trainer.monitors.put_scalar(self._eval_dataset + '-' + k, v)
def _trigger_epoch(self):
if self.epoch_num in self.epochs_to_eval:
......
......@@ -7,7 +7,7 @@ import numpy as np
import os
import time
import tensorflow as tf
from six.moves import map
from six.moves import map, queue
from tensorflow.python.client import timeline
from ..tfutils.common import gpu_available_in_session
......@@ -23,7 +23,7 @@ __all__ = ['GPUUtilizationTracker', 'GraphProfiler', 'PeakMemoryTracker']
class GPUUtilizationTracker(Callback):
""" Summarize the average GPU utilization within an epoch.
It will start a process to run ``nvidia-smi`` every second
It will start a process to obtain GPU utilization through NVML every second
within the epoch (the trigger_epoch time was not included),
and write average utilization to monitors.
......@@ -74,7 +74,14 @@ class GPUUtilizationTracker(Callback):
def _trigger_epoch(self):
# Don't do this in after_epoch because
# before,after_epoch are supposed to be extremely fast by design.
stats = self._queue.get()
try:
stats = self._queue.get(timeout=60)
except queue.Empty:
if self._proc.is_alive():
raise RuntimeError("GPUUtilization.worker() is stuck. This is a bug.")
else:
raise RuntimeError("GPUUtilization.worker() process is killed unexpectedly.")
if stats == -1:
from ..train.base import StopTraining
raise StopTraining("GPUUtilizationTracker.worker has failed.")
......
......@@ -7,7 +7,7 @@ try:
# issue#1924 may happen on old systems
import cv2 # noqa
# cv2.setNumThreads(0)
if int(cv2.__version__.split('.')[0]) == 3:
if int(cv2.__version__.split('.')[0]) >= 3:
cv2.ocl.setUseOpenCL(False)
# check if cv is built with cuda or openmp
info = cv2.getBuildInformation().split('\n')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment