Still use msgpack by default. Speed varies between situations; more investigation is needed.

a2b26f2b · Yuxin Wu · d8d3ed1a · a2b26f2b · a2b26f2b · a2b26f2b
Commit a2b26f2b authored Jan 10, 2018 by Yuxin Wu
Hide whitespace changes
Inline Side-by-side

Showing with 30 additions and 13 deletions

CHANGES.md CHANGES.md +0 -2

docs/conf.py docs/conf.py +1 -1

requirements.txt requirements.txt +2 -1

tensorpack/utils/serialize.py tensorpack/utils/serialize.py +27 -9

No files found.
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -8,8 +8,6 @@ so you won't need to look at here very often.
 Here are a list of things that were changed, starting from an early version.
 TensorFlow itself also changed APIs before 1.0 and those are not listed here.

-+ [2018/01/09] Switched to pyarrow for faster serialization/deserialization on buffer-like objects.
-	Old serialized data may not be compatible with future versions.
 + [2017/10/21]
 	tensorpack is gradually switching to a new Trainer API.
 	The old API will keep working for a while. See [issue](https://github.com/ppwwyyxx/tensorpack/issues/458)

--- a/docs/conf.py
+++ b/docs/conf.py
@@ -30,7 +30,7 @@ MOCK_MODULES = ['tabulate', 'h5py',
                'scipy', 'scipy.misc', 'scipy.io',
                'tornado', 'tornado.concurrent',
                'horovod', 'horovod.tensorflow',
-                'pyarrow',
+                'pyarrow', 'msgpack', 'msgpack_numpy',
                'functools32']
 for mod_name in MOCK_MODULES:
    sys.modules[mod_name] = mock.Mock(name=mod_name)

--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,8 @@ six
 termcolor>=1.1
 tabulate>=0.7.7
 tqdm>4.11.1
-pyarrow
+msgpack
+msgpack-numpy>=0.4.0
 pyzmq>=16
 subprocess32; python_version < '3.0'
 functools32; python_version < '3.0'
--- a/tensorpack/utils/serialize.py
+++ b/tensorpack/utils/serialize.py
@@ -3,7 +3,14 @@
 # File: serialize.py
 # Author: Yuxin Wu <ppwwyyxxc@gmail.com>

-import pyarrow as pa
+import msgpack
+import msgpack_numpy
+msgpack_numpy.patch()
+
+try:
+    import pyarrow as pa
+except ImportError:
+    pass


 __all__ = ['loads', 'dumps']
@@ -12,6 +19,23 @@ __all__ = ['loads', 'dumps']
 def dumps(obj):
    """
    Serialize an object.
+    Returns:
+        Implementation-dependent bytes-like object
+    """
+    return msgpack.dumps(obj, use_bin_type=True)
+
+
+def loads(buf):
+    """
+    Args:
+        buf: the output of `dumps`.
+    """
+    return msgpack.loads(buf, encoding='utf-8')
+
+
+def dumps_pyarrow(obj):
+    """
+    Serialize an object.

    Returns:
        Implementation-dependent bytes-like object
@@ -19,15 +43,9 @@ def dumps(obj):
    return pa.serialize(obj).to_buffer()


-def loads(buf):
+def loads_pyarrow(buf):
    """
    Args:
        buf: the output of `dumps`.
    """
-    try:
-        return pa.deserialize(buf)
-    except pa.ArrowIOError:
-        # Handle data serialized by old version of tensorpack.
-        import msgpack
-        import msgpack_numpy as mn
-        return msgpack.unpackb(buf, object_hook=mn.decode, encoding='utf-8')
+    return pa.deserialize(buf)