Commit 43b04fc8 authored by Yuxin Wu's avatar Yuxin Wu

update docs

parent 1d3ab162
...@@ -34,3 +34,5 @@ It does the following extra things for you: ...@@ -34,3 +34,5 @@ It does the following extra things for you:
2. Random parameter generation and the actual augmentation is separated. This allows you to apply the 2. Random parameter generation and the actual augmentation is separated. This allows you to apply the
same transformation to several images together (with `AugmentImageComponents`), same transformation to several images together (with `AugmentImageComponents`),
which is essential to tasks such as segmentation. which is essential to tasks such as segmentation.
Or apply the same transformations to images plus coordinate labels (with `AugmentImageCoordinates`),
which is essential to tasks such as detection and localization.
...@@ -93,7 +93,7 @@ class AugmentImageComponent(MapDataComponent): ...@@ -93,7 +93,7 @@ class AugmentImageComponent(MapDataComponent):
class AugmentImageCoordinates(MapData): class AugmentImageCoordinates(MapData):
""" """
Apply image augmentors on an image and set of coordinates. Apply image augmentors on an image and a list of coordinates.
""" """
def __init__(self, ds, augmentors, img_index=0, coords_index=1, copy=True): def __init__(self, ds, augmentors, img_index=0, coords_index=1, copy=True):
""" """
......
...@@ -46,18 +46,20 @@ class PrefetchProcess(mp.Process): ...@@ -46,18 +46,20 @@ class PrefetchProcess(mp.Process):
class PrefetchData(ProxyDataFlow): class PrefetchData(ProxyDataFlow):
""" """
Prefetch data from a DataFlow using Python multiprocessing utilities. Prefetch data from a DataFlow using Python multiprocessing utilities.
It will fork the process calling :meth:`__init__`, and collect datapoints from `ds` in each
process by a Python :class:`multiprocessing.Queue`.
Note: Note:
1. This is significantly slower than :class:`PrefetchDataZMQ` when data is large. 1. The underlying dataflow worker will be forked multiple times when ``nr_proc>1``.
2. When nesting like this: ``PrefetchDataZMQ(PrefetchData(df, nr_proc=a), nr_proc=b)``.
A total of ``a`` instances of ``df`` worker processes will be created.
This is different from the behavior of :class:`PrefetchDataZMQ`.
3. The underlying dataflow worker will be forked multiple times When ``nr_proc>1``.
As a result, unless the underlying dataflow is fully shuffled, the data distribution As a result, unless the underlying dataflow is fully shuffled, the data distribution
produced by this dataflow will be wrong. produced by this dataflow will be wrong.
(e.g. you are likely to see duplicated datapoints at the beginning) (e.g. you are likely to see duplicated datapoints at the beginning)
2. This is significantly slower than :class:`PrefetchDataZMQ` when data is large.
3. When nesting like this: ``PrefetchDataZMQ(PrefetchData(df, nr_proc=a), nr_proc=b)``.
A total of ``a`` instances of ``df`` worker processes will be created.
This is different from the behavior of :class:`PrefetchDataZMQ`
""" """
def __init__(self, ds, nr_prefetch, nr_proc=1): def __init__(self, ds, nr_prefetch, nr_proc):
""" """
Args: Args:
ds (DataFlow): input DataFlow. ds (DataFlow): input DataFlow.
...@@ -113,21 +115,26 @@ class PrefetchProcessZMQ(mp.Process): ...@@ -113,21 +115,26 @@ class PrefetchProcessZMQ(mp.Process):
class PrefetchDataZMQ(ProxyDataFlow): class PrefetchDataZMQ(ProxyDataFlow):
""" """
Prefetch data from a DataFlow using multiple processes, with ZMQ for Prefetch data from a DataFlow using multiple processes, with ZeroMQ for
communication. communication.
It will fork the process calling :meth:`reset_state()`,
A local directory is needed to put the ZMQ pipes. collect datapoints from `ds` in each process by ZeroMQ IPC pipe.
You can set this with env var ``$TENSORPACK_PIPEDIR`` if you're running on non-local FS such as NFS or GlusterFS.
Note: Note:
1. Once :meth:`reset_state` is called, this dataflow becomes not fork-safe. 1. The underlying dataflow worker will be forked multiple times when ``nr_proc>1``.
2. When nesting like this: ``PrefetchDataZMQ(PrefetchDataZMQ(df, nr_proc=a), nr_proc=b)``.
A total of ``a * b`` instances of ``df`` worker processes will be created.
Also in this case some zmq pipes cannot be cleaned at exit.
3. The underlying dataflow worker will be forked multiple times when ``nr_proc>1``.
As a result, unless the underlying dataflow is fully shuffled, the data distribution As a result, unless the underlying dataflow is fully shuffled, the data distribution
produced by this dataflow will be wrong. produced by this dataflow will be wrong.
(e.g. you are likely to see duplicated datapoints at the beginning) (e.g. you are likely to see duplicated datapoints at the beginning)
2. Once :meth:`reset_state` is called, this dataflow becomes not fork-safe.
i.e., if you fork an already reset instance of this dataflow,
it won't be usable in the forked process.
3. When nesting like this: ``PrefetchDataZMQ(PrefetchDataZMQ(df, nr_proc=a), nr_proc=b)``.
A total of ``a * b`` instances of ``df`` worker processes will be created.
Also in this case, some zmq pipes cannot be cleaned at exit.
4. A local directory is needed to put the ZMQ pipes.
You can set this with env var ``$TENSORPACK_PIPEDIR`` if you're
running on certain non-local FS that may not support pipes, such as NFS or GlusterFS.
""" """
def __init__(self, ds, nr_proc=1, hwm=50): def __init__(self, ds, nr_proc=1, hwm=50):
""" """
...@@ -186,12 +193,12 @@ class PrefetchDataZMQ(ProxyDataFlow): ...@@ -186,12 +193,12 @@ class PrefetchDataZMQ(ProxyDataFlow):
self.procs = [PrefetchProcessZMQ(self.ds, self.pipename, self._hwm) self.procs = [PrefetchProcessZMQ(self.ds, self.pipename, self._hwm)
for _ in range(self.nr_proc)] for _ in range(self.nr_proc)]
self.start_processes() self._start_processes()
# __del__ not guaranteed to get called at exit # __del__ not guaranteed to get called at exit
import atexit import atexit
atexit.register(lambda x: x.__del__(), self) atexit.register(lambda x: x.__del__(), self)
def start_processes(self): def _start_processes(self):
start_proc_mask_signal(self.procs) start_proc_mask_signal(self.procs)
def __del__(self): def __del__(self):
...@@ -224,7 +231,7 @@ class PrefetchOnGPUs(PrefetchDataZMQ): ...@@ -224,7 +231,7 @@ class PrefetchOnGPUs(PrefetchDataZMQ):
self.gpus = gpus self.gpus = gpus
super(PrefetchOnGPUs, self).__init__(ds, len(gpus)) super(PrefetchOnGPUs, self).__init__(ds, len(gpus))
def start_processes(self): def _start_processes(self):
with mask_sigint(): with mask_sigint():
for gpu, proc in zip(self.gpus, self.procs): for gpu, proc in zip(self.gpus, self.procs):
with change_gpu(gpu): with change_gpu(gpu):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment