Commit feaae168 authored by Yuxin Wu

logging for concurrency utilities

parent c68686e6
......@@ -66,6 +66,18 @@ dataset instead of a down-sampled version here.
The average resolution is about 400x350 <sup>[[1]]</sup>.
The original images (JPEG compressed) are 140G in total.
We start from a simple DataFlow:
```python
from tensorpack import *
ds = dataset.ILSVRC12('/path/to/ILSVRC12', 'train', shuffle=True)
ds = BatchData(ds, 256, use_list=True)
TestDataSpeed(ds).start_test()
```
Here the first `ds` simply reads the original images from the filesystem, and the second `ds` batches them, so
that we can test the speed of this DataFlow in batches per second. By default, `BatchData`
concatenates the data into an ndarray, but since the images are of different shapes, we use
`use_list=True` so that it just produces lists.
[1]: #ref
......
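For readers unfamiliar with the `use_list` flag, here is a minimal sketch (plain NumPy, not part of the commit) of why variable-sized images need it:
```python
import numpy as np

# Two images of different shapes, as they come out of ILSVRC12:
images = [np.zeros((400, 350, 3), dtype='uint8'),
          np.zeros((375, 500, 3), dtype='uint8')]

# What `use_list=True` yields: a plain list, each image keeps its own shape.
batched_as_list = list(images)
print([img.shape for img in batched_as_list])   # [(400, 350, 3), (375, 500, 3)]

# The default path stacks each component with np.asarray, which only makes
# sense once all images share one shape (e.g. after a resize augmentor).
same_shape = [np.zeros((224, 224, 3), dtype='uint8') for _ in range(2)]
print(np.asarray(same_shape).shape)             # (2, 224, 224, 3)
```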
......@@ -16,7 +16,7 @@ probably because of async issues.
The pre-trained models are all trained with 4 GPUs for about 2 days.
But note that multi-GPU doesn't give you obvious speedup here,
because the bottleneck is not computation but data. On machines without huge memory, you may also need to
because the bottleneck in this implementation is not computation but data. On machines without huge memory, you may also need to
enable tcmalloc to keep training throughput more stable.
Occasionally, processes may not get terminated completely; it is therefore suggested to use `systemd-run` to run any
......
......@@ -44,8 +44,12 @@ class StartProcOrThread(Callback):
if isinstance(k, mp.Process):
logger.info("Stopping {} ...".format(k.name))
k.terminate()
k.join()
k.join(5.0)
if k.is_alive():
logger.error("Cannot join process {}.".format(k.name))
elif isinstance(k, StoppableThread):
logger.info("Stopping {} ...".format(k.name))
k.stop()
k.join()
k.join(5.0)
if k.is_alive():
logger.error("Cannot join thread {}.".format(k.name))
......@@ -56,7 +56,7 @@ class BatchData(ProxyDataFlow):
of the original datapoints.
"""
def __init__(self, ds, batch_size, remainder=False, allow_list=False):
def __init__(self, ds, batch_size, remainder=False, use_list=False):
"""
Args:
ds (DataFlow): Its components must be either scalars or :class:`np.ndarray`.
......@@ -65,7 +65,7 @@ class BatchData(ProxyDataFlow):
remainder (bool): whether to return the remaining data smaller than a batch_size.
If set True, it will possibly generate a data point of a smaller batch size.
Otherwise, all generated data are guaranteed to have the same size.
allow_list (bool): if True, it will run faster by producing a list
use_list (bool): if True, it will run faster by producing a list
of datapoints instead of an ndarray of datapoints, avoiding an
extra copy.
"""
......@@ -77,7 +77,7 @@ class BatchData(ProxyDataFlow):
pass
self.batch_size = batch_size
self.remainder = remainder
self.allow_list = allow_list
self.use_list = use_list
def size(self):
ds_size = self.ds.size()
......@@ -96,17 +96,17 @@ class BatchData(ProxyDataFlow):
for data in self.ds.get_data():
holder.append(data)
if len(holder) == self.batch_size:
yield BatchData._aggregate_batch(holder, self.allow_list)
yield BatchData._aggregate_batch(holder, self.use_list)
del holder[:]
if self.remainder and len(holder) > 0:
yield BatchData._aggregate_batch(holder, self.allow_list)
yield BatchData._aggregate_batch(holder, self.use_list)
@staticmethod
def _aggregate_batch(data_holder, allow_list):
def _aggregate_batch(data_holder, use_list):
size = len(data_holder[0])
result = []
for k in range(size):
if allow_list:
if use_list:
result.append(
[x[k] for x in data_holder])
else:
......
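As a hedged sketch of what `_aggregate_batch` produces under the two settings (plain NumPy, datapoints assumed to be `[image, label]`, not tensorpack code):
```python
import numpy as np

# Two datapoints of the form [image, label]:
holder = [[np.zeros((224, 224, 3), dtype='uint8'), 3],
          [np.ones((224, 224, 3), dtype='uint8'), 7]]
size = len(holder[0])

# use_list=True: each component stays a plain Python list (no extra copy).
as_list = [[dp[k] for dp in holder] for k in range(size)]
assert isinstance(as_list[0], list) and len(as_list[0]) == 2

# use_list=False (default): each component is stacked into one ndarray.
as_array = [np.asarray([dp[k] for dp in holder]) for k in range(size)]
assert as_array[0].shape == (2, 224, 224, 3)
assert as_array[1].shape == (2,)
```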
......@@ -98,7 +98,7 @@ class ILSVRCMeta(object):
class ILSVRC12(RNGDataFlow):
"""
Produces ILSVRC12 images of shape [h, w, 3(BGR)], and a label between [0, 999],
Produces uint8 ILSVRC12 images of shape [h, w, 3(BGR)], and a label between [0, 999],
and optionally a bounding box of [xmin, ymin, xmax, ymax].
"""
def __init__(self, dir, name, meta_dir=None, shuffle=None,
......
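A quick way to confirm the documented datapoint format, assuming an ILSVRC12 copy at the placeholder path (a sketch, not part of the commit):
```python
from tensorpack import *

# '/path/to/ILSVRC12' is a placeholder; point it at a real ILSVRC12 directory.
ds = dataset.ILSVRC12('/path/to/ILSVRC12', 'val', shuffle=False)
ds.reset_state()
img, label = next(ds.get_data())
print(img.dtype, img.shape, label)   # uint8, (h, w, 3) in BGR, label in [0, 999]
```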
......@@ -6,7 +6,9 @@
import multiprocessing
import six
from six.moves import queue, range
import tensorflow as tf
from ..utils import logger
from ..utils.concurrency import DIE, StoppableThread
from ..tfutils.modelutils import describe_model
from .base import OfflinePredictor, AsyncPredictorBase
......@@ -83,7 +85,13 @@ class PredictorWorkerThread(StoppableThread):
def run(self):
while not self.stopped():
batched, futures = self.fetch_batch()
outputs = self.func(batched)
try:
outputs = self.func(batched)
except tf.errors.CancelledError:
for f in futures:
f.cancel()
logger.warn("PredictorWorkerThread id={}, call was cancelled.".format(self.id))
return
# print "Worker {} batched {} Queue {}".format(
# self.id, len(futures), self.queue.qsize())
# debug, for speed testing
......
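The control flow being added — catch the cancellation, cancel the waiters' futures, log, and bail out of the worker loop — can be sketched with `concurrent.futures` alone (a hedged stand-alone illustration; in the real code the exception is `tf.errors.CancelledError` and the futures come from the async predictor's queue):
```python
from concurrent.futures import Future
import logging

def run_once(func, batched, futures):
    """One iteration of the worker loop: run the batched call, or cancel
    the pending futures if the call itself was cancelled."""
    try:
        outputs = func(batched)
    except RuntimeError:                 # stand-in for tf.errors.CancelledError
        for f in futures:
            f.cancel()
        logging.warning("predictor call was cancelled")
        return
    for f, out in zip(futures, outputs):
        f.set_result(out)

def cancelled_call(batched):
    raise RuntimeError("session is closing")

futs = [Future() for _ in range(2)]
run_once(cancelled_call, None, futs)
print([f.cancelled() for f in futs])     # [True, True]
```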
......@@ -178,6 +178,7 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer,
self.sess.run([op])
next(self.async_step_counter)
th = LoopThread(f)
th.name = "AsyncLoopThread-{}".format(k)
th.pause()
th.start()
self.training_threads.append(th)
......
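Naming the loop threads pays off as soon as anything is logged from inside them, since the name shows up in the log record; a tiny standalone sketch (not tensorpack code):
```python
import logging
import threading

logging.basicConfig(format="%(threadName)s: %(message)s", level=logging.INFO)

def step():
    logging.info("running one async training step")

for k in range(2):
    th = threading.Thread(target=step, name="AsyncLoopThread-{}".format(k))
    th.start()
    th.join()
# Output lines are prefixed with AsyncLoopThread-0 / AsyncLoopThread-1.
```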