Commit da5e9e66 authored by Yuxin Wu's avatar Yuxin Wu

add allow_list to BatchData. Default to False for now to test

parent 7c694aca
...@@ -47,9 +47,14 @@ class TestDataSpeed(ProxyDataFlow): ...@@ -47,9 +47,14 @@ class TestDataSpeed(ProxyDataFlow):
class BatchData(ProxyDataFlow): class BatchData(ProxyDataFlow):
""" """
Group data into batches. Concat datapoints into batches.
It produces datapoints of the same number of components as ``ds``, but
each component has one new extra dimension of size ``batch_size``.
The new component can be a list of the original datapoints, or an ndarray
of the original datapoints.
""" """
def __init__(self, ds, batch_size, remainder=False):
def __init__(self, ds, batch_size, remainder=False, allow_list=False):
""" """
Args: Args:
ds (DataFlow): Its components must be either scalars or :class:`np.ndarray`. ds (DataFlow): Its components must be either scalars or :class:`np.ndarray`.
...@@ -58,6 +63,9 @@ class BatchData(ProxyDataFlow): ...@@ -58,6 +63,9 @@ class BatchData(ProxyDataFlow):
remainder (bool): whether to return the remaining data smaller than a batch_size. remainder (bool): whether to return the remaining data smaller than a batch_size.
If set True, it may possibly generate a data point of a smaller batch size. If set True, it may possibly generate a data point of a smaller batch size.
Otherwise, all generated data are guaranteed to have the same size. Otherwise, all generated data are guaranteed to have the same size.
allow_list (bool): if True, it will run faster by producing a list
of datapoints instead of an ndarray of datapoints, avoiding an
extra copy.
""" """
super(BatchData, self).__init__(ds) super(BatchData, self).__init__(ds)
if not remainder: if not remainder:
...@@ -67,6 +75,7 @@ class BatchData(ProxyDataFlow): ...@@ -67,6 +75,7 @@ class BatchData(ProxyDataFlow):
pass pass
self.batch_size = batch_size self.batch_size = batch_size
self.remainder = remainder self.remainder = remainder
self.allow_list = allow_list
def size(self): def size(self):
ds_size = self.ds.size() ds_size = self.ds.size()
...@@ -85,32 +94,36 @@ class BatchData(ProxyDataFlow): ...@@ -85,32 +94,36 @@ class BatchData(ProxyDataFlow):
for data in self.ds.get_data(): for data in self.ds.get_data():
holder.append(data) holder.append(data)
if len(holder) == self.batch_size: if len(holder) == self.batch_size:
yield BatchData._aggregate_batch(holder) yield BatchData._aggregate_batch(holder, self.allow_list)
del holder[:] del holder[:]
if self.remainder and len(holder) > 0: if self.remainder and len(holder) > 0:
yield BatchData._aggregate_batch(holder) yield BatchData._aggregate_batch(holder, self.allow_list)
@staticmethod @staticmethod
def _aggregate_batch(data_holder): def _aggregate_batch(data_holder, allow_list):
size = len(data_holder[0]) size = len(data_holder[0])
result = [] result = []
for k in range(size): for k in range(size):
dt = data_holder[0][k] if allow_list:
if type(dt) in [int, bool]:
tp = 'int32'
elif type(dt) == float:
tp = 'float32'
else:
tp = dt.dtype
try:
result.append( result.append(
np.array([x[k] for x in data_holder], dtype=tp)) [x[k] for x in data_holder])
except KeyboardInterrupt: else:
raise dt = data_holder[0][k]
except: if type(dt) in [int, bool]:
logger.exception("Cannot batch data. Perhaps they are of inconsistent shape?") tp = 'int32'
import IPython as IP elif type(dt) == float:
IP.embed(config=IP.terminal.ipapp.load_default_config()) tp = 'float32'
else:
tp = dt.dtype
try:
result.append(
np.array([x[k] for x in data_holder], dtype=tp))
except KeyboardInterrupt:
raise
except:
logger.exception("Cannot batch data. Perhaps they are of inconsistent shape?")
import IPython as IP
IP.embed(config=IP.terminal.ipapp.load_default_config())
return result return result
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment