Commit eaec5b2b authored by Yuxin Wu

fix back-compatibility of LMDBDataPoint (#215)

parent 27841032
In the Sphinx `conf.py`:

```diff
@@ -152,15 +152,13 @@ todo_include_todos = True
 # a list of builtin themes.
 import sphinx_rtd_theme
 html_theme = "sphinx_rtd_theme"
+# Add any paths that contain custom themes here, relative to this directory.
 html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

 # Theme options are theme-specific and customize the look and feel of a theme
 # further. For a list of options available for each theme, see the
 # documentation.
-#html_theme_options = {}
-# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+html_theme_options = {}

 # The name for this set of Sphinx documents. If None, it defaults to
 # "<project> v<release> documentation".
```
```diff
@@ -229,6 +227,8 @@ html_show_copyright = True
 # This is the file name suffix for HTML files (e.g. ".xhtml").
 #html_file_suffix = None

+html_compact_lists = False
+
 # Language to be used for generating the HTML full-text search index.
 # Sphinx supports the following languages:
 # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
```
In the Efficient DataFlow tutorial:

```diff
@@ -11,7 +11,8 @@ We use ILSVRC12 training set, which contains 1.28 million images.
 The original images (JPEG compressed) are 140G in total.
 The average resolution is about 400x350 <sup>[[1]]</sup>.
 Following the [ResNet example](../examples/ResNet), we need images in their original resolution,
-so we'll read the original dataset instead of a down-sampled version.
+so we'll read the original dataset instead of a down-sampled version, and
+apply complicated preprocessing to it.
 We'll need to reach a speed of, roughly 1000 images per second, to keep GPUs busy.

 Note that the actual performance would depend on not only the disk, but also
```
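(For scale: 140G over 1.28 million images works out to roughly 110KB per JPEG on average, so the 1000 images/sec target means sustaining about 110MB/s of reads, on top of all the decoding and preprocessing work.)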
````diff
@@ -163,6 +164,7 @@ Then we add necessary transformations:
   ds = AugmentImageComponent(ds, lots_of_augmentors)
   ds = BatchData(ds, 256)
 ```
+
 1. `LMDBDataPoint` deserialize the datapoints (from string to [jpeg_string, label])
 2. Use OpenCV to decode the first component into ndarray
 3. Apply augmentations to the ndarray
````

````diff
@@ -188,6 +190,7 @@ launch the underlying DataFlow in one independent process, and only parallelize
 (`PrefetchDataZMQ` is faster but not fork-safe, so the first prefetch has to be `PrefetchData`. This is [issue#138](https://github.com/ppwwyyxx/tensorpack/issues/138))

+
 Let me summarize what the above DataFlow does:
 1. One process reads LMDB file, shuffle them in a buffer and put them into a `multiprocessing.Queue` (used by `PrefetchData`).
 2. 25 processes take items from the queue, decode and process them into [image, label] pairs, and
    send them through ZMQ IPC pipes.
````
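Putting the visible pieces together, the pipeline this part of the tutorial describes looks roughly like the sketch below. It is a reconstruction, not the tutorial's verbatim code: the LMDB path, the 50000-datapoint shuffle buffer, `lots_of_augmentors`, and the exact position of `PrefetchDataZMQ` relative to `BatchData` are assumptions.

```python
import cv2
from tensorpack.dataflow import (AugmentImageComponent, BatchData,
                                 LMDBDataPoint, LocallyShuffleData,
                                 MapDataComponent, PrefetchData,
                                 PrefetchDataZMQ)

lots_of_augmentors = []  # placeholder for the tutorial's list of augmentors

# One process reads the LMDB file, shuffles datapoints within a buffer, and
# feeds a multiprocessing.Queue. PrefetchData is fork-safe, so it has to be
# the first prefetch (issue #138 above).
ds = LMDBDataPoint('/path/to/ILSVRC12-train.lmdb', shuffle=False)
ds = LocallyShuffleData(ds, 50000)
ds = PrefetchData(ds, 5000, 1)
# 25 worker processes take items off the queue, decode the JPEG bytes,
# apply augmentations, and send [image, label] pairs over ZMQ IPC pipes.
ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
ds = AugmentImageComponent(ds, lots_of_augmentors)
ds = PrefetchDataZMQ(ds, 25)
ds = BatchData(ds, 256)
```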
And the back-compatibility fix itself, in `LMDBDataPoint`:

```diff
@@ -156,14 +156,19 @@ class LMDBDataPoint(MapData):
     """ Read a LMDB file and produce deserialized values.
         This can work with :func:`tensorpack.dataflow.dftools.dump_dataflow_to_lmdb`. """

-    def __init__(self, lmdb_data):
+    def __init__(self, *args, **kwargs):
         """
         Args:
-            lmdb_data: a :class:`LMDBData` instance.
+            args, kwargs: Same as in :class:`LMDBData`.
         """
+        if isinstance(args[0], LMDBData):
+            ds = args[0]
+        else:
+            ds = LMDBData(*args, **kwargs)
+
         def f(dp):
             return loads(dp[1])
-        super(LMDBDataPoint, self).__init__(lmdb_data, f)
+        super(LMDBDataPoint, self).__init__(ds, f)


 def CaffeLMDB(lmdb_path, shuffle=True, keys=None):
```
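With this change, `LMDBDataPoint` accepts either an already-constructed `LMDBData` instance or the constructor arguments of `LMDBData` itself. A minimal sketch of both call styles (the path is a placeholder):

```python
from tensorpack.dataflow import LMDBData, LMDBDataPoint

path = '/path/to/data.lmdb'  # placeholder

# Wrap an existing LMDBData instance ...
ds = LMDBDataPoint(LMDBData(path, shuffle=False))

# ... or pass LMDBData's own arguments and let LMDBDataPoint build it.
ds = LMDBDataPoint(path, shuffle=False)
```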