Small changes

91ea782a · Yuxin Wu · 8dd254be
Commit 91ea782a authored Apr 14, 2019 by Yuxin Wu
6 changed files
--- a/examples/FasterRCNN/NOTES.md
+++ b/examples/FasterRCNN/NOTES.md
@@ -89,7 +89,7 @@ Therefore, not every version of TF ≥ 1.6 supports every feature in this implem
 1. TF < 1.10: `SyncBN` with NCCL will fail ([PR](https://github.com/tensorflow/tensorflow/pull/20360)).
 1. TF 1.11 & 1.12: multithread inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/22750)).
   Latest tensorpack will apply a workaround.
-1. TF > 1.12: MKL inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/24650)).
+1. TF 1.13: MKL inference will fail ([issue](https://github.com/tensorflow/tensorflow/issues/24650)).
 1. TF > 1.12: Horovod training will fail ([issue](https://github.com/tensorflow/tensorflow/issues/25946)).
   Latest tensorpack will apply a workaround.


--- a/examples/FasterRCNN/config.py
+++ b/examples/FasterRCNN/config.py
@@ -90,6 +90,7 @@ _C.DATA.NUM_CATEGORY = 0  # without the background class (e.g., 80 for COCO)
 _C.DATA.CLASS_NAMES = []  # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG".
 # whether the coordinates in the annotations are absolute pixel values, or a relative value in [0, 1]
 _C.DATA.ABSOLUTE_COORD = True
+_C.DATA.NUM_WORKERS = 5  # number of data loading workers

 # basemodel ----------------------
 _C.BACKBONE.WEIGHTS = ''   # /path/to/weights.npz

--- a/examples/FasterRCNN/data.py
+++ b/examples/FasterRCNN/data.py
@@ -369,10 +369,10 @@ def get_train_dataflow():
        return ret

    if cfg.TRAINER == 'horovod':
-        ds = MultiThreadMapData(ds, 5, preprocess)
+        ds = MultiThreadMapData(ds, cfg.DATA.NUM_WORKERS, preprocess)
        # MPI does not like fork()
    else:
-        ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
+        ds = MultiProcessMapDataZMQ(ds, cfg.DATA.NUM_WORKERS, preprocess)
    return ds



--- a/examples/FasterRCNN/eval.py
+++ b/examples/FasterRCNN/eval.py
@@ -251,7 +251,7 @@ class EvalCallback(Callback):
        scores = DetectionDataset().eval_or_save_inference_results(
            all_results, self._eval_dataset, output_file)
        for k, v in scores.items():
-            self.trainer.monitors.put_scalar(k, v)
+            self.trainer.monitors.put_scalar(self._eval_dataset + '-' + k, v)

    def _trigger_epoch(self):
        if self.epoch_num in self.epochs_to_eval:

--- a/tensorpack/callbacks/prof.py
+++ b/tensorpack/callbacks/prof.py
@@ -7,7 +7,7 @@ import numpy as np
 import os
 import time
 import tensorflow as tf
-from six.moves import map
+from six.moves import map, queue
 from tensorflow.python.client import timeline

 from ..tfutils.common import gpu_available_in_session
@@ -23,7 +23,7 @@ __all__ = ['GPUUtilizationTracker', 'GraphProfiler', 'PeakMemoryTracker']
 class GPUUtilizationTracker(Callback):
    """ Summarize the average GPU utilization within an epoch.

-    It will start a process to run ``nvidia-smi`` every second
+    It will start a process to obtain GPU utilization through NVML every second
    within the epoch (the trigger_epoch time was not included),
    and write average utilization to monitors.

@@ -74,7 +74,14 @@ class GPUUtilizationTracker(Callback):
    def _trigger_epoch(self):
        # Don't do this in after_epoch because
        # before,after_epoch are supposed to be extremely fast by design.
-        stats = self._queue.get()
+        try:
+            stats = self._queue.get(timeout=60)
+        except queue.Empty:
+            if self._proc.is_alive():
+                raise RuntimeError("GPUUtilization.worker() is stuck. This is a bug.")
+            else:
+                raise RuntimeError("GPUUtilization.worker() process is killed unexpectedly.")
+
        if stats == -1:
            from ..train.base import StopTraining
            raise StopTraining("GPUUtilizationTracker.worker has failed.")

--- a/tensorpack/libinfo.py
+++ b/tensorpack/libinfo.py
@@ -7,7 +7,7 @@ try:
    # issue#1924 may happen on old systems
    import cv2  # noqa
    # cv2.setNumThreads(0)
-    if int(cv2.__version__.split('.')[0]) == 3:
+    if int(cv2.__version__.split('.')[0]) >= 3:
        cv2.ocl.setUseOpenCL(False)
    # check if cv is built with cuda or openmp
    info = cv2.getBuildInformation().split('\n')