Commit 869bc638 authored by Yuxin Wu's avatar Yuxin Wu

bind option in zmq tools. more notes about horovod.

parent 28f36c44
......@@ -101,7 +101,6 @@ def get_config(model, fake=False):
callbacks=callbacks,
steps_per_epoch=100 if args.fake else 1280000 // args.batch,
max_epoch=110,
nr_tower=nr_tower
)
......
......@@ -83,7 +83,7 @@ def fbresnet_augmentor(isTrain):
def get_imagenet_dataflow(
datadir, name, batch_size,
augmentors):
augmentors, parallel=None):
"""
See explanations in the tutorial:
http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
......@@ -92,11 +92,12 @@ def get_imagenet_dataflow(
assert datadir is not None
assert isinstance(augmentors, list)
isTrain = name == 'train'
cpu = min(40, multiprocessing.cpu_count())
if parallel is None:
parallel = min(40, multiprocessing.cpu_count())
if isTrain:
ds = dataset.ILSVRC12(datadir, name, shuffle=True)
ds = AugmentImageComponent(ds, augmentors, copy=False)
ds = PrefetchDataZMQ(ds, cpu)
ds = PrefetchDataZMQ(ds, parallel)
ds = BatchData(ds, batch_size, remainder=False)
else:
ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
......@@ -107,7 +108,7 @@ def get_imagenet_dataflow(
im = cv2.imread(fname, cv2.IMREAD_COLOR)
im = aug.augment(im)
return im, cls
ds = MultiThreadMapData(ds, cpu, mapf, buffer_size=2000, strict=True)
ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
ds = BatchData(ds, batch_size, remainder=True)
ds = PrefetchDataZMQ(ds, 1)
return ds
......
......@@ -9,10 +9,11 @@ import pprint
from termcolor import colored
from collections import deque, defaultdict
from six.moves import range, map
import tqdm
from .base import DataFlow, ProxyDataFlow, RNGDataFlow, DataFlowReentrantGuard
from ..utils import logger
from ..utils.utils import get_tqdm, get_rng
from ..utils.utils import get_tqdm, get_rng, get_tqdm_kwargs
from ..utils.develop import log_deprecated
__all__ = ['TestDataSpeed', 'PrintData', 'BatchData', 'BatchDataByShape', 'FixedSizeData', 'MapData',
......@@ -23,14 +24,16 @@ __all__ = ['TestDataSpeed', 'PrintData', 'BatchData', 'BatchDataByShape', 'Fixed
class TestDataSpeed(ProxyDataFlow):
""" Test the speed of some DataFlow """
def __init__(self, ds, size=5000):
def __init__(self, ds, size=5000, warmup=0):
"""
Args:
ds (DataFlow): the DataFlow to test.
size (int): number of datapoints to fetch.
warmup (int): warmup iterations
"""
super(TestDataSpeed, self).__init__(ds)
self.test_size = size
self.test_size = int(size)
self.warmup = int(warmup)
def get_data(self):
""" Will run testing at the beginning, then produce data normally. """
......@@ -43,10 +46,14 @@ class TestDataSpeed(ProxyDataFlow):
Start testing with a progress bar.
"""
self.ds.reset_state()
itr = self.ds.get_data()
if self.warmup:
for d in tqdm.trange(self.warmup, **get_tqdm_kwargs()):
next(itr)
# add smoothing for speed benchmark
with get_tqdm(total=self.test_size,
leave=True, smoothing=0.2) as pbar:
for idx, dp in enumerate(self.ds.get_data()):
for idx, dp in enumerate(itr):
pbar.update()
if idx == self.test_size - 1:
break
......
......@@ -33,10 +33,10 @@ def send_dataflow_zmq(df, addr, hwm=50, format=None, bind=False):
hwm (int): ZMQ high-water mark (buffer size)
format (str): The serialization format.
Default format would use :mod:`tensorpack.utils.serialize`.
An alternate format is 'zmq_op', used by https://github.com/tensorpack/zmq_ops.
An alternate format is 'zmq_ops', used by https://github.com/tensorpack/zmq_ops.
bind (bool): whether to bind or connect to the endpoint.
"""
assert format in [None, 'zmq_op']
assert format in [None, 'zmq_op', 'zmq_ops']
if format is None:
dump_fn = dumps
else:
......@@ -52,7 +52,8 @@ def send_dataflow_zmq(df, addr, hwm=50, format=None, bind=False):
socket.connect(addr)
try:
df.reset_state()
logger.info("Serving data to {} ...".format(addr))
logger.info("Serving data to {} with {} format ...".format(
addr, 'default' if format is None else 'zmq_ops'))
INTERVAL = 200
q = deque(maxlen=INTERVAL)
......@@ -60,7 +61,7 @@ def send_dataflow_zmq(df, addr, hwm=50, format=None, bind=False):
total = df.size()
except NotImplementedError:
total = 0
tqdm_args = get_tqdm_kwargs(leave=True)
tqdm_args = get_tqdm_kwargs(leave=True, smoothing=0.8)
tqdm_args['bar_format'] = tqdm_args['bar_format'] + "{postfix}"
while True:
with tqdm.trange(total, **tqdm_args) as pbar:
......@@ -87,24 +88,32 @@ class RemoteDataZMQ(DataFlow):
Attributes:
cnt1, cnt2 (int): number of data points received from addr1 and addr2
"""
def __init__(self, addr1, addr2=None, hwm=50):
def __init__(self, addr1, addr2=None, hwm=50, bind=True):
"""
Args:
addr1,addr2 (str): addr of the socket to connect to.
addr1,addr2 (str): addr of the zmq endpoint to connect to.
Use both if you need two protocols (e.g. both IPC and TCP).
I don't think you'll ever need 3.
hwm (int): ZMQ high-water mark (buffer size)
bind (bool): whether to connect or bind the endpoint
"""
assert addr1
self._addr1 = addr1
self._addr2 = addr2
self._hwm = int(hwm)
self._guard = DataFlowReentrantGuard()
self._bind = bind
def reset_state(self):
self.cnt1 = 0
self.cnt2 = 0
def bind_or_connect(self, socket, addr):
if self._bind:
socket.bind(addr)
else:
socket.connect(addr)
def get_data(self):
with self._guard:
try:
......@@ -112,7 +121,7 @@ class RemoteDataZMQ(DataFlow):
if self._addr2 is None:
socket = ctx.socket(zmq.PULL)
socket.set_hwm(self._hwm)
socket.bind(self._addr1)
self.bind_or_connect(socket, self._addr1)
while True:
dp = loads(socket.recv(copy=False).bytes)
......@@ -121,11 +130,11 @@ class RemoteDataZMQ(DataFlow):
else:
socket1 = ctx.socket(zmq.PULL)
socket1.set_hwm(self._hwm)
socket1.bind(self._addr1)
self.bind_or_connect(socket1, self._addr1)
socket2 = ctx.socket(zmq.PULL)
socket2.set_hwm(self._hwm)
socket2.bind(self._addr2)
self.bind_or_connect(socket2, self._addr2)
poller = zmq.Poller()
poller.register(socket1, zmq.POLLIN)
......
......@@ -375,7 +375,7 @@ class ZMQInput(TensorInput):
Recv tensors from a ZMQ endpoint, with ops from https://github.com/tensorpack/zmq_ops.
It works with :meth:`dataflow.remote.send_dataflow_zmq(format='zmq_op')`.
"""
def __init__(self, end_point, hwm):
def __init__(self, end_point, hwm, bind=True):
"""
Args:
end_point (str):
......@@ -383,6 +383,7 @@ class ZMQInput(TensorInput):
"""
self._end_point = end_point
self._hwm = int(hwm)
self._bind = bind
def fn():
ret = self._zmq_pull_socket.pull()
......@@ -401,7 +402,8 @@ class ZMQInput(TensorInput):
self._zmq_pull_socket = zmq_ops.ZMQPullSocket(
self._end_point,
[x.type for x in inputs_desc],
self._hwm)
hwm=self._hwm,
bind=self._bind)
class TFDatasetInput(FeedfreeInput):
......
......@@ -280,10 +280,16 @@ class HorovodTrainer(SingleCostTrainer):
--output-filename mylog -x LD_LIBRARY_PATH -x CUDA_VISIBLE_DEVICES=0,1,2,3 \
python train.py
(Add other environment variables you need by -x, e.g. PYTHONPATH, PATH)
Note:
1. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
1. Gradients are averaged among all processes.
2. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
3. Due to the use of MPI, training is less informative (no progress bar).
2. Due to the use of MPI, training is less informative (no progress bar).
4. MPI often fails to kill all processes. Be sure to check it.
"""
def __init__(self):
hvd.init()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment