Use msgpack instead of pyarrow for "serialization to disk".

805c44d5 · Yuxin Wu · 708e07b0 · 805c44d5 · 805c44d5 · 805c44d5
Commit 805c44d5 authored Aug 27, 2018 by Yuxin Wu
6 changed files
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -8,13 +8,15 @@ so you don't need to look at here very often.
 Here is a list of things that were changed, starting from an early version.
 TensorFlow itself also changes API and those are not listed here.
-+ [2018/04/05] msgpack is replaced by pyarrow. If you want old behavior,
++ [2018/08/27] msgpack is used again for "serialization to disk", because pyarrow
+  has no compatibility between versions. To use pyarrow instead, `export TENSORPACK_COMPATIBLE_SERIALIZE=pyarrow`.
++ [2018/04/05] msgpack is replaced by pyarrow in favor of its speed. If you want old behavior,
 	`export TENSORPACK_SERIALIZE=msgpack`.
 + [2018/03/20] `ModelDesc` starts to use simplified interfaces:
 	+ `_get_inputs()` renamed to `inputs()` and returns `tf.placeholder`s.
 	+ `build_graph(self, tensor1, tensor2)` returns the cost tensor directly.
 	+ `_get_optimizer()` renamed to `optimizer()`.
-	Old interface will still be available, but new ones are recommended.
+	Old interface will still be available for a while, but new ones are recommended.
 + [2018/03/12] `JSONWriter` uses a different file name, and will not automatically restore epoch number.
 	`AutoResumeTrainConfig` was added to support resuming.
 + [2017/10/21]

--- a/docs/modules/utils.rst
+++ b/docs/modules/utils.rst
@@ -51,6 +51,15 @@ tensorpack.utils.serialize module
    :undoc-members:
    :show-inheritance:
+tensorpack.utils.compatible_serialize module
+--------------------------------------------
+.. automodule:: tensorpack.utils.compatible_serialize
+    :members:
+    :undoc-members:
+    :show-inheritance:
 tensorpack.utils.stats module
 -----------------------------

--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,8 @@ setup(
        "tabulate>=0.7.7",
        "tqdm>4.11.1",
        "pyarrow>=0.9.0",
+        "msgpack>=0.5.2",
+        "msgpack-numpy>=0.4.0",
        "pyzmq>=16",
        "subprocess32; python_version < '3.0'",
        "functools32; python_version < '3.0'",

--- a/tensorpack/dataflow/format.py
+++ b/tensorpack/dataflow/format.py
@@ -11,7 +11,7 @@ from ..utils import logger
 from ..utils.utils import get_tqdm
 from ..utils.timer import timed_operation
 from ..utils.loadcaffe import get_caffe_pb
-from ..utils.serialize import loads
+from ..utils.compatible_serialize import loads
 from ..utils.argtools import log_once
 from ..utils.develop import log_deprecated
 from .base import RNGDataFlow, DataFlow, DataFlowReentrantGuard

--- a/tensorpack/dataflow/serialize.py
+++ b/tensorpack/dataflow/serialize.py
@@ -7,7 +7,7 @@ from collections import defaultdict
 from ..utils.utils import get_tqdm
 from ..utils import logger
-from ..utils.serialize import dumps, loads
+from ..utils.compatible_serialize import dumps, loads
 from .base import DataFlow
 from .format import LMDBData, HDF5Data
@@ -46,7 +46,7 @@ class LMDBSerializer():
        if isdir:
            assert not os.path.isfile(os.path.join(path, 'data.mdb')), "LMDB file exists!"
        else:
-            assert not os.path.isfile(path), "LMDB file exists!"
+            assert not os.path.isfile(path), "LMDB file {} exists!".format(path)
        db = lmdb.open(path, subdir=isdir,
                       map_size=1099511627776 * 2, readonly=False,
                       meminit=False, map_async=True)    # need sync() at the end
@@ -126,7 +126,7 @@ class TFRecordSerializer():
            df (DataFlow): the DataFlow to serialize.
            path (str): output tfrecord file.
        """
-        if os.environ.get('TENSORPACK_SERIALIZE', None) == 'msgpack':
+        if os.environ.get('TENSORPACK_COMPATIBLE_SERIALIZE', 'msgpack') == 'msgpack':
            def _dumps(dp):
                return dumps(dp)
        else:

--- a/tensorpack/utils/serialize.py
+++ b/tensorpack/utils/serialize.py
@@ -2,6 +2,8 @@
 # File: serialize.py
 import os
+import pyarrow as pa
 from .develop import create_dummy_func
 __all__ = ['loads', 'dumps']
@@ -44,20 +46,11 @@ def loads_pyarrow(buf):
    return pa.deserialize(buf)
-try:
-    # fixed in pyarrow 0.9: https://github.com/apache/arrow/pull/1223#issuecomment-359895666
-    import pyarrow as pa
-except ImportError:
-    pa = None
-    dumps_pyarrow = create_dummy_func('dumps_pyarrow', ['pyarrow'])  # noqa
-    loads_pyarrow = create_dummy_func('loads_pyarrow', ['pyarrow'])  # noqa
 try:
    import msgpack
    import msgpack_numpy
    msgpack_numpy.patch()
 except ImportError:
-    assert pa is not None, "pyarrow is a dependency of tensorpack!"
    loads_msgpack = create_dummy_func(  # noqa
        'loads_msgpack', ['msgpack', 'msgpack_numpy'])
    dumps_msgpack = create_dummy_func(  # noqa