Commit eaec5b2b authored by Yuxin Wu

fix back-compatibility of LMDBDataPoint (#215)

parent 27841032
......@@ -152,15 +152,13 @@ todo_include_todos = True
# a list of builtin themes.
import sphinx_rtd_theme
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom themes here, relative to this directory.
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
html_theme_options = {}
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
......@@ -229,6 +227,8 @@ html_show_copyright = True
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
html_compact_lists = False
# Language to be used for generating the HTML full-text search index.
# Sphinx supports the following languages:
# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
......
......@@ -11,7 +11,8 @@ We use ILSVRC12 training set, which contains 1.28 million images.
The original images (JPEG compressed) are 140G in total.
The average resolution is about 400x350 <sup>[[1]]</sup>.
Following the [ResNet example](../examples/ResNet), we need images in their original resolution,
so we'll read the original dataset instead of a down-sampled version.
so we'll read the original dataset instead of a down-sampled version, and
apply complicated preprocessing to it.
We'll need to reach a speed of roughly 1000 images per second to keep GPUs busy.
Note that the actual performance would depend on not only the disk, but also
......@@ -163,6 +164,7 @@ Then we add necessary transformations:
ds = AugmentImageComponent(ds, lots_of_augmentors)
ds = BatchData(ds, 256)
```
1. `LMDBDataPoint` deserializes the datapoints (from string to [jpeg_string, label])
2. Use OpenCV to decode the first component (the JPEG string) into an ndarray
3. Apply augmentations to the ndarray (these steps are sketched below)
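To make the three steps concrete, here is a minimal sketch of that transformation chain, assuming an LMDB file written by `dump_dataflow_to_lmdb` at a hypothetical path, with `lots_of_augmentors` standing in as a placeholder augmentor list:
```python
import cv2
import numpy as np
from tensorpack.dataflow import (
    LMDBDataPoint, MapDataComponent, AugmentImageComponent, BatchData, imgaug)

# 1. Deserialize each datapoint from its stored string into [jpeg_string, label].
ds = LMDBDataPoint('/path/to/ILSVRC-train.lmdb', shuffle=False)  # hypothetical path
# 2. Decode the first component (the JPEG buffer) into an ndarray with OpenCV.
ds = MapDataComponent(
    ds, lambda buf: cv2.imdecode(np.frombuffer(buf, np.uint8), cv2.IMREAD_COLOR), 0)
# 3. Apply augmentations to the ndarray, then batch (augmentors are illustrative).
lots_of_augmentors = [imgaug.ResizeShortestEdge(256), imgaug.CenterCrop(224)]
ds = AugmentImageComponent(ds, lots_of_augmentors)
ds = BatchData(ds, 256)
```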
......@@ -188,6 +190,7 @@ launch the underlying DataFlow in one independent process, and only parallelize
(`PrefetchDataZMQ` is faster but not fork-safe, so the first prefetch has to be `PrefetchData`. This is [issue#138](https://github.com/ppwwyyxx/tensorpack/issues/138))
Let me summarize what the above DataFlow does:
1. One process reads the LMDB file, shuffles the datapoints in a buffer, and puts them into a `multiprocessing.Queue` (used by `PrefetchData`).
2. 25 processes take items from the queue, decode and process them into [image, label] pairs, and
send them through ZMQ IPC pipes (see the sketch below).
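A hedged sketch of that two-stage layering; the path, buffer sizes, process count, and augmentors are illustrative, and the decode/augment steps run inside the `PrefetchDataZMQ` workers because they sit between the two prefetch layers:
```python
import cv2
import numpy as np
from tensorpack.dataflow import (
    LMDBDataPoint, LocallyShuffleData, PrefetchData, PrefetchDataZMQ,
    MapDataComponent, AugmentImageComponent, BatchData, imgaug)

# Stage 1, one process: read the LMDB file, shuffle within a buffer, and hand
# datapoints to the consumers through a multiprocessing.Queue (PrefetchData).
ds = LMDBDataPoint('/path/to/ILSVRC-train.lmdb', shuffle=False)  # hypothetical path
ds = LocallyShuffleData(ds, 50000)
ds = PrefetchData(ds, 5000, 1)

# Stage 2, 25 processes: each worker takes items from the queue, decodes and
# augments them into [image, label] pairs, and sends them over ZMQ IPC pipes.
ds = MapDataComponent(
    ds, lambda buf: cv2.imdecode(np.frombuffer(buf, np.uint8), cv2.IMREAD_COLOR), 0)
ds = AugmentImageComponent(ds, [imgaug.ResizeShortestEdge(256), imgaug.CenterCrop(224)])
ds = PrefetchDataZMQ(ds, 25)
ds = BatchData(ds, 256)
```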
......
......@@ -156,14 +156,19 @@ class LMDBDataPoint(MapData):
""" Read a LMDB file and produce deserialized values.
This can work with :func:`tensorpack.dataflow.dftools.dump_dataflow_to_lmdb`. """
def __init__(self, lmdb_data):
def __init__(self, *args, **kwargs):
"""
Args:
lmdb_data: a :class:`LMDBData` instance.
args, kwargs: Same as in :class:`LMDBData`.
"""
if isinstance(args[0], LMDBData):
ds = args[0]
else:
ds = LMDBData(*args, **kwargs)
def f(dp):
return loads(dp[1])
super(LMDBDataPoint, self).__init__(lmdb_data, f)
super(LMDBDataPoint, self).__init__(ds, f)
def CaffeLMDB(lmdb_path, shuffle=True, keys=None):
......
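For reference, a small usage sketch of the two call styles this change accepts (the path is hypothetical):
```python
from tensorpack.dataflow import LMDBData, LMDBDataPoint

# New convenience form: pass LMDBData's own arguments; the LMDBData instance
# is constructed internally.
ds1 = LMDBDataPoint('/path/to/data.lmdb', shuffle=False)

# Back-compatible form restored by this commit: pass an existing LMDBData instance.
ds2 = LMDBDataPoint(LMDBData('/path/to/data.lmdb', shuffle=False))
```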