Commit edecca96 authored by Yuxin Wu

sphinx docs for models/

parent b335a7ba
@@ -36,6 +36,7 @@ import mock
 MOCK_MODULES = ['scipy',
                 'tensorflow', 'tensorflow.contrib',
+                'tensorflow.python.ops',
                 'tensorflow.contrib.framework',
                 'tensorflow.models',
                 'tensorflow.models.rnn',
@@ -64,10 +65,15 @@ from tensorpack.models import *
 # ones.
 extensions = [
     'sphinx.ext.autodoc',
-    'sphinx.ext.coverage',
-    'sphinx.ext.mathjax',
+    'sphinx.ext.napoleon',
+    #'sphinx.ext.coverage',
+    #'sphinx.ext.mathjax',
+    'sphinx.ext.mathbase',
     'sphinx.ext.viewcode',
 ]
+napoleon_google_docstring = True
+napoleon_numpy_docstring = False
+napoleon_use_rtype = False

 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
...
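Enabling sphinx.ext.napoleon is what lets the Google-style docstrings added throughout this commit render as structured parameter lists. A minimal sketch of the docstring style napoleon parses (the function and its arguments are made up purely for illustration):

def scale(x, factor=2.0):
    """
    Multiply a tensor by a constant factor.

    Args:
        x (tf.Tensor): the input tensor.
        factor (float): the multiplier.

    Returns:
        tf.Tensor: ``x * factor``.
    """
    return x * factor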
@@ -2,6 +2,11 @@
 # File: update.sh
 # Author: Yuxin Wu <ppwwyyxxc@gmail.com>
+
+PROG_NAME=`readlink -f $0`
+PROG_DIR=`dirname "$PROG_NAME"`
+cd "$PROG_DIR"
+
 make clean
 #sphinx-apidoc -o modules ../tensorpack -f -d 10
 make html
@@ -69,7 +69,7 @@ BITW = 1
 BITA = 2
 BITG = 6
 TOTAL_BATCH_SIZE = 128
-BATCH_SIZE = 64
+BATCH_SIZE = None

 class Model(ModelDesc):
...
@@ -7,7 +7,7 @@ from types import ModuleType
 import six
 import os
 import os.path
-# this line is necessary for TFModuleFunc to work
+# this line is necessary for _TFModuleFunc to work
 import tensorflow as tf  # noqa: F401
 from ..utils import logger
@@ -34,8 +34,7 @@ class LinearWrap(object):
     consisting of layers / symbolic functions with only one input & output.
     """

-    class TFModuleFunc(object):
+    class _TFModuleFunc(object):
         def __init__(self, mod, tensor):
             self._mod = mod
             self._t = tensor
@@ -43,7 +42,7 @@ class LinearWrap(object):
         def __getattr__(self, name):
             ret = getattr(self._mod, name)
             if isinstance(ret, ModuleType):
-                return LinearWrap.TFModuleFunc(ret, self._t)
+                return LinearWrap._TFModuleFunc(ret, self._t)
             else:
                 # assume to be a tf function
                 def f(*args, **kwargs):
@@ -52,6 +51,10 @@ class LinearWrap(object):
             return f

     def __init__(self, tensor):
+        """
+        Args:
+            tensor (tf.Tensor): the tensor to wrap
+        """
         self._t = tensor

     def __getattr__(self, layer_name):
@@ -76,10 +79,15 @@ class LinearWrap(object):
             if layer_name != 'tf':
                 logger.warn("You're calling LinearWrap.__getattr__ with something neither a layer nor 'tf'!")
             assert isinstance(layer, ModuleType)
-            return LinearWrap.TFModuleFunc(layer, self._t)
+            return LinearWrap._TFModuleFunc(layer, self._t)

     def apply(self, func, *args, **kwargs):
-        """ send tensor to the first argument of a simple func"""
+        """
+        Apply a function on the wrapped tensor.
+
+        Returns:
+            LinearWrap: ``LinearWrap(func(self.tensor(), *args, **kwargs))``.
+        """
         ret = func(self._t, *args, **kwargs)
         return LinearWrap(ret)
@@ -87,8 +95,20 @@ class LinearWrap(object):
         return self._t

     def tensor(self):
+        """
+        Equivalent to ``self.__call__()``.
+
+        Returns:
+            tf.Tensor: the underlying wrapped tensor.
+        """
         return self._t

     def print_tensor(self):
+        """
+        Print the underlying tensor and return self. Can be useful to get the
+        name of tensors inside :class:`LinearWrap`.
+
+        :return: self
+        """
         print(self._t)
         return self
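For context on the API being documented here: LinearWrap chains registered layers by name and unwraps the final tf.Tensor with __call__ or tensor(). A hedged usage sketch; the layer names and parameters below are illustrative, not part of this commit:

# wrap an input tensor, chain a few layers, then unwrap the result
logits = (LinearWrap(image)
          .Conv2D('conv0', 32, 3)
          .MaxPooling('pool0', 2)
          .FullyConnected('fc0', out_dim=10)())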
@@ -20,21 +20,6 @@ __all__ = ['BatchNorm', 'BatchNormV1', 'BatchNormV2']

 @layer_register(log_shape=False)
 def BatchNormV1(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
-    """
-    Batch normalization layer as described in:
-    `Batch Normalization: Accelerating Deep Network Training by
-    Reducing Internal Covariance Shift <http://arxiv.org/abs/1502.03167>`_.
-
-    :param input: a NHWC or NC tensor
-    :param use_local_stat: bool. whether to use mean/var of this batch or the moving average.
-        Default to True in training and False in inference.
-    :param decay: decay rate. default to 0.9.
-    :param epsilon: default to 1e-5.
-
-    Note that only the first training tower maintains a moving average.
-    """
     shape = x.get_shape().as_list()
     assert len(shape) in [2, 4]
@@ -114,18 +99,8 @@ def BatchNormV1(x, use_local_stat=None, decay=0.9, epsilon=1e-5):

 @layer_register(log_shape=False)
 def BatchNormV2(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
     """
-    Batch normalization layer as described in:
-
-    `Batch Normalization: Accelerating Deep Network Training by
-    Reducing Internal Covariance Shift <http://arxiv.org/abs/1502.03167>`_.
-
-    :param input: a NHWC or NC tensor
-    :param use_local_stat: bool. whether to use mean/var of this batch or the moving average.
-        Default to True in training and False in inference.
-    :param decay: decay rate. default to 0.9.
-    :param epsilon: default to 1e-5.
-
-    Note that only the first training tower maintains a moving average.
+    A slightly faster but equivalent version of BatchNormV1, which uses
+    ``fused_batch_norm`` in training.
     """
     shape = x.get_shape().as_list()
     assert len(shape) in [2, 4]
@@ -185,8 +160,27 @@ def BatchNormV2(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
     return tf.identity(xn, name='output')

-if get_tf_version() >= 12:
-    BatchNorm = BatchNormV2
-else:
-    logger.warn("BatchNorm might be faster if you update TensorFlow")
-    BatchNorm = BatchNormV1
+def BatchNorm(*args, **kwargs):
+    """
+    Batch normalization layer, as described in the paper:
+    `Batch Normalization: Accelerating Deep Network Training by
+    Reducing Internal Covariance Shift <http://arxiv.org/abs/1502.03167>`_.
+
+    Args:
+        x (tf.Tensor): a NHWC or NC tensor.
+        use_local_stat (bool): whether to use mean/var of the current batch or the moving average.
+            Defaults to True in training and False in inference.
+        decay (float): decay rate of moving average.
+        epsilon (float): epsilon to avoid divide-by-zero.
+
+    Note:
+        * In multi-tower training, only the first training tower maintains a moving average.
+        * It automatically selects :meth:`BatchNormV1` or :meth:`BatchNormV2`
+          according to availability.
+    """
+    if get_tf_version() >= 12:
+        return BatchNormV2(*args, **kwargs)
+    else:
+        logger.warn("BatchNorm might be faster if you update TensorFlow")
+        return BatchNormV1(*args, **kwargs)
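Because the dispatcher forwards *args and **kwargs, the call site is the same regardless of which implementation runs. A minimal usage sketch, following the same calling convention the BNReLU layer later in this commit uses (the scope name 'bn' and the input tensor are illustrative):

# registered layers take the variable-scope name as the first argument
x = BatchNorm('bn', x)          # normalize a NHWC (or NC) tensor
x = tf.nn.relu(x, name='relu')  # e.g. followed by a nonlinearity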
@@ -5,7 +5,6 @@
 import tensorflow as tf

 from ._common import layer_register, shape2d, shape4d
-from ..utils import logger

 __all__ = ['Conv2D', 'Deconv2D']
@@ -14,21 +13,22 @@ __all__ = ['Conv2D', 'Deconv2D']
 def Conv2D(x, out_channel, kernel_shape,
            padding='SAME', stride=1,
            W_init=None, b_init=None,
-           nl=None, split=1, use_bias=True):
+           nl=tf.identity, split=1, use_bias=True):
     """
     2D convolution on 4D inputs.

-    :param input: a tensor of shape NHWC
-    :param out_channel: number of output channel
-    :param kernel_shape: (h, w) or a int
-    :param stride: (h, w) or a int. default to 1
-    :param padding: 'valid' or 'same'. default to 'same'
-    :param split: split channels as used in Alexnet. Default to 1 (no split)
-    :param W_init: initializer for W. default to `xavier_initializer_conv2d`.
-    :param b_init: initializer for b. default to zero initializer.
-    :param nl: nonlinearity
-    :param use_bias: whether to use bias. a boolean default to True
-    :returns: a NHWC tensor
+    Args:
+        x (tf.Tensor): a tensor of shape NHWC.
+            Must have known number of channels, but can have other unknown dimensions.
+        out_channel (int): number of output channel.
+        kernel_shape: (h, w) tuple or a int.
+        stride: (h, w) tuple or a int.
+        padding (str): 'valid' or 'same'. Case insensitive.
+        split (int): Split channels as used in Alexnet. Defaults to 1 (no split).
+        W_init: initializer for W. Defaults to `variance_scaling_initializer`.
+        b_init: initializer for b. Defaults to zero.
+        nl: a nonlinearity function.
+        use_bias (bool): whether to use bias.
     """
     in_shape = x.get_shape().as_list()
     in_channel = in_shape[-1]
@@ -53,22 +53,15 @@ def Conv2D(x, out_channel, kernel_shape,
     if split == 1:
         conv = tf.nn.conv2d(x, W, stride, padding)
     else:
-        # TODO rename to split later
         inputs = tf.split(x, split, 3)
         kernels = tf.split(W, split, 3)
         outputs = [tf.nn.conv2d(i, k, stride, padding)
                    for i, k in zip(inputs, kernels)]
         conv = tf.concat_v2(outputs, 3)
-    if nl is None:
-        logger.warn(
-            "[DEPRECATED] Default ReLU nonlinearity for Conv2D and FullyConnected will be deprecated. "
-            "Please use argscope instead.")
-        nl = tf.nn.relu
     return nl(tf.nn.bias_add(conv, b) if use_bias else conv, name='output')


 class StaticDynamicShape(object):
     def __init__(self, static, dynamic):
         self.static = static
         self.dynamic = dynamic
@@ -89,17 +82,18 @@ def Deconv2D(x, out_shape, kernel_shape,
     """
     2D deconvolution on 4D inputs.

-    :param input: a tensor of shape NHWC
-    :param out_shape: either (h, w, channel), or just channel,
-        then h, w will calculated by input_shape * stride
-    :param kernel_shape: (h, w) or a int
-    :param stride: (h, w) or a int
-    :param padding: 'valid' or 'same'. default to 'same'
-    :param W_init: initializer for W. default to `xavier_initializer_conv2d`.
-    :param b_init: initializer for b. default to zero initializer.
-    :param nl: nonlinearity.
-    :param use_bias: whether to use bias. a boolean default to True
-    :returns: a NHWC tensor
+    Args:
+        x (tf.Tensor): a tensor of shape NHWC.
+            Must have known number of channels, but can have other unknown dimensions.
+        out_shape: (h, w, channel) tuple, or just a integer channel,
+            then (h, w) will be calculated by input_shape * stride
+        kernel_shape: (h, w) tuple or a int.
+        stride: (h, w) tuple or a int.
+        padding (str): 'valid' or 'same'. Case insensitive.
+        W_init: initializer for W. Defaults to `variance_scaling_initializer`.
+        b_init: initializer for b. Defaults to zero.
+        nl: a nonlinearity function.
+        use_bias (bool): whether to use bias.
     """
     in_shape = x.get_shape().as_list()[1:]
     in_channel = in_shape[-1]
...
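Note that with the default nonlinearity changed from an implicit ReLU to tf.identity, call sites that want ReLU now have to ask for it. A hedged usage sketch; the layer names and sizes are illustrative:

# layer_register: first positional argument is the variable-scope name
l = Conv2D('conv1', image, out_channel=64, kernel_shape=3, nl=tf.nn.relu)
l = Conv2D('conv2', l, 64, 3)                               # linear output by default
l = Deconv2D('deconv1', l, out_shape=64, kernel_shape=4, stride=2)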
@@ -7,7 +7,6 @@ import tensorflow as tf

 from ._common import layer_register
 from ..tfutils import symbolic_functions as symbf
-from ..utils import logger

 __all__ = ['FullyConnected']
@@ -15,17 +14,17 @@ __all__ = ['FullyConnected']
 @layer_register()
 def FullyConnected(x, out_dim,
                    W_init=None, b_init=None,
-                   nl=None, use_bias=True):
+                   nl=tf.identity, use_bias=True):
     """
-    Fully-Connected layer.
-
-    :param input: a tensor to be flattened except the first dimension.
-    :param out_dim: output dimension
-    :param W_init: initializer for W. default to `xavier_initializer_conv2d`.
-    :param b_init: initializer for b. default to zero initializer.
-    :param nl: nonlinearity
-    :param use_bias: whether to use bias. a boolean default to True
-    :returns: a 2D tensor
+    Fully-Connected layer. Takes a N>1D tensor and returns a 2D tensor.
+
+    Args:
+        x (tf.Tensor): a tensor to be flattened except for the first dimension.
+        out_dim (int): output dimension
+        W_init: initializer for W. Defaults to `variance_scaling_initializer`.
+        b_init: initializer for b. Defaults to zero.
+        nl: a nonlinearity function
+        use_bias (bool): whether to use bias.
     """
     x = symbf.batch_flatten(x)
     in_dim = x.get_shape().as_list()[1]
@@ -39,9 +38,4 @@ def FullyConnected(x, out_dim,
     if use_bias:
         b = tf.get_variable('b', [out_dim], initializer=b_init)
     prod = tf.nn.xw_plus_b(x, W, b) if use_bias else tf.matmul(x, W)
-    if nl is None:
-        logger.warn(
-            "[DEPRECATED] Default ReLU nonlinearity for Conv2D and FullyConnected will be deprecated."
-            " Please use argscope instead.")
-        nl = tf.nn.relu
     return nl(prod, name='output')
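The same default-nonlinearity change applies here. A short usage sketch with illustrative names and dimensions:

feat = FullyConnected('fc0', l, out_dim=512, nl=tf.nn.relu)
logits = FullyConnected('fc-logits', feat, out_dim=10)   # nl defaults to tf.identity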
@@ -48,15 +48,18 @@ def sample(img, coords):

 @layer_register()
 def ImageSample(inputs, borderMode='repeat'):
     """
-    Sample the template image, using the given coordinate, by bilinear interpolation.
-    It mimics the same behavior described in:
+    Sample the template image using the given coordinate, by bilinear interpolation.
+    This was described in the paper:
     `Spatial Transformer Networks <http://arxiv.org/abs/1506.02025>`_.

-    :param input: [template, mapping]. template of shape NHWC.
-        mapping of shape NHW2, where each pair of the last dimension is a (y, x) real-value
-        coordinate.
-    :param borderMode: either 'repeat' or 'constant' (0)
-    :returns: a NHWC output tensor.
+    Args:
+        inputs (list): [template, coords]. template has shape NHWC.
+            coords has shape (N,H',W',2), where each pair of the last dimension is a (y, x) real-value
+            coordinate.
+        borderMode: either "repeat" or "constant" (zero-filled)
+
+    Returns:
+        a (N,H',W',C) tensor.
     """
     # TODO borderValue
     template, mapping = inputs
...
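For readers unfamiliar with the operation, a minimal NumPy sketch of what bilinear sampling computes at one real-valued (y, x) coordinate; this illustrates the math only, not the layer's implementation:

import numpy as np

def bilinear_sample_at(img, y, x):
    # img: (H, W, C); (y, x): a real-valued in-bounds coordinate
    y0, x0 = int(np.floor(y)), int(np.floor(x))
    y1, x1 = y0 + 1, x0 + 1
    wy, wx = y - y0, x - x0
    # weighted average of the four neighboring pixels
    return ((1 - wy) * (1 - wx) * img[y0, x0] +
            (1 - wy) * wx * img[y0, x1] +
            wy * (1 - wx) * img[y1, x0] +
            wy * wx * img[y1, x1])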
@@ -17,7 +17,15 @@ __all__ = ['ModelDesc', 'InputVar', 'ModelFromMetaGraph']

 class InputVar(object):
+    """ Store metadata about input placeholders. """
+
     def __init__(self, type, shape, name, sparse=False):
+        """
+        Args:
+            type: tf type of the tensor.
+            shape (list):
+            name (str):
+            sparse (bool): whether to use ``tf.sparse_placeholder``.
+        """
         self.type = type
         self.shape = shape
         self.name = name
@@ -39,7 +47,8 @@ class ModelDesc(object):
         """
         Create or return (if already created) raw input TF placeholder vars in the graph.

-        :returns: the list of raw input vars in the graph
+        Returns:
+            list[tf.Tensor]: the list of input placeholders in the graph.
         """
         if hasattr(self, 'reuse_input_vars'):
             return self.reuse_input_vars
@@ -51,7 +60,12 @@ class ModelDesc(object):
     get_reuse_placehdrs = get_input_vars

     def build_placeholders(self, prefix=''):
-        """ build placeholders with optional prefix, for each InputVar """
+        """
+        For each InputVar, create new placeholders with optional prefix and
+        return them. Useful when building new towers.
+
+        Returns:
+            list[tf.Tensor]: the list of built placeholders.
+        """
         input_vars = self._get_input_vars()
         for v in input_vars:
@@ -65,20 +79,25 @@ class ModelDesc(object):
         return ret

     def get_input_vars_desc(self):
-        """ return a list of `InputVar` instance"""
+        """
+        Returns:
+            list[:class:`InputVar`]: list of the underlying :class:`InputVar`.
+        """
         return self._get_input_vars()

     @abstractmethod
     def _get_input_vars(self):
-        """:returns: a list of InputVar """
+        """
+        :returns: a list of InputVar
+        """

     def build_graph(self, model_inputs):
         """
-        Setup the whole graph.
-
-        :param model_inputs: a list of input variable in the graph.
-        :param is_training: a boolean
-        :returns: the cost to minimize. a scalar variable
+        Build the whole symbolic graph.
+
+        Args:
+            model_inputs (list[tf.Tensor]): a list of inputs, corresponding to
+                InputVars of this model.
         """
         if len(inspect.getargspec(self._build_graph).args) == 3:
             logger.warn("[DEPRECATED] _build_graph(self, input_vars, is_training) is deprecated! \
@@ -92,13 +111,19 @@ Use _build_graph(self, input_vars) and get_current_tower_context().is_training i
         pass

     def get_cost(self):
+        """
+        Return the cost tensor in the graph. Called by some of the :class:`tensorpack.train.Trainer` which
+        assumes single-cost models.
+        """
         return self._get_cost()

     def _get_cost(self, *args):
         return self.cost

     def get_gradient_processor(self):
-        """ Return a list of GradientProcessor. They will be executed in order"""
+        """ Return a list of :class:`tensorpack.tfutils.GradientProcessor`.
+        They will be executed by the trainer in the given order.
+        """
         return [  # SummaryGradient(),
             CheckGradient()
         ]
@@ -106,11 +131,15 @@ Use _build_graph(self, input_vars) and get_current_tower_context().is_training i

 class ModelFromMetaGraph(ModelDesc):
     """
-    Load the whole exact TF graph from a saved meta_graph.
+    Load the exact TF graph from a saved meta_graph.
     Only useful for inference.
     """

     def __init__(self, filename):
+        """
+        Args:
+            filename(str): file name of the saved meta graph.
+        """
         tf.train.import_meta_graph(filename)
         all_coll = tf.get_default_graph().get_all_collection_keys()
         for k in [INPUT_VARS_KEY, tf.GraphKeys.TRAINABLE_VARIABLES,
...
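To tie the documented pieces together, here is a minimal subclass sketch assuming a single-image classification setup; the shapes, names, and layers are illustrative, not from this commit:

class MyModel(ModelDesc):
    def _get_input_vars(self):
        # declare inputs by metadata; ModelDesc builds the placeholders on demand
        return [InputVar(tf.float32, [None, 28, 28], 'input'),
                InputVar(tf.int32, [None], 'label')]

    def _build_graph(self, input_vars):
        image, label = input_vars
        image = tf.expand_dims(image, 3)          # make it NHWC
        logits = (LinearWrap(image)
                  .Conv2D('conv0', 32, 3, nl=tf.nn.relu)
                  .MaxPooling('pool0', 2)
                  .FullyConnected('fc0', 10)())
        # single-cost trainers read this attribute through get_cost()
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label),
            name='cost')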
@@ -14,11 +14,14 @@ __all__ = ['Maxout', 'PReLU', 'LeakyReLU', 'BNReLU']

 @layer_register()
 def Maxout(x, num_unit):
     """
-    Maxout as in `Maxout Networks <http://arxiv.org/abs/1302.4389>`_.
-
-    :param input: a NHWC or NC tensor.
-    :param num_unit: a int. must be divisible by C.
-    :returns: a NHW(C/num_unit) tensor
+    Maxout as in the paper `Maxout Networks <http://arxiv.org/abs/1302.4389>`_.
+
+    Args:
+        x (tf.Tensor): a NHWC or NC tensor. Channel has to be known.
+        num_unit (int): a int. Must be divisible by C.
+
+    Returns:
+        tf.Tensor: of shape NHW(C/num_unit).
     """
     input_shape = x.get_shape().as_list()
     ndim = len(input_shape)
@@ -33,42 +36,42 @@ def Maxout(x, num_unit):

 @layer_register(log_shape=False)
-def PReLU(x, init=tf.constant_initializer(0.001), name=None):
+def PReLU(x, init=0.001, name='output'):
     """
-    Parameterized relu as in `Delving Deep into Rectifiers: Surpassing
+    Parameterized ReLU as in the paper `Delving Deep into Rectifiers: Surpassing
     Human-Level Performance on ImageNet Classification
     <http://arxiv.org/abs/1502.01852>`_.

-    :param input: any tensor.
-    :param init: initializer for the p. default to 0.001.
+    Args:
+        x (tf.Tensor): input
+        init (float): initial value for the learnable slope.
+        name (str): name of the output.
     """
+    init = tf.constant_initializer(init)
     alpha = tf.get_variable('alpha', [], initializer=init)
     x = ((1 + alpha) * x + (1 - alpha) * tf.abs(x))
-    if name is None:
-        name = 'output'
     return tf.mul(x, 0.5, name=name)


 @layer_register(use_scope=False, log_shape=False)
-def LeakyReLU(x, alpha, name=None):
+def LeakyReLU(x, alpha, name='output'):
     """
-    Leaky relu as in `Rectifier Nonlinearities Improve Neural Network Acoustic
+    Leaky ReLU as in paper `Rectifier Nonlinearities Improve Neural Network Acoustic
     Models
     <http://ai.stanford.edu/~amaas/papers/relu_hybrid_icml2013_final.pdf>`_.

-    :param input: any tensor.
-    :param alpha: the negative slope.
+    Args:
+        x (tf.Tensor): input
+        alpha (float): the slope.
     """
-    if name is None:
-        name = 'output'
     return tf.maximum(x, alpha * x, name=name)
-    # alpha = float(alpha)
-    # x = ((1 + alpha) * x + (1 - alpha) * tf.abs(x))
-    # return tf.mul(x, 0.5, name=name)


 @layer_register(log_shape=False, use_scope=False)
 def BNReLU(x, name=None):
+    """
+    A shorthand of BatchNormalization + ReLU.
+    """
     x = BatchNorm('bn', x)
     x = tf.nn.relu(x, name=name)
     return x
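The PReLU expression above is a branch-free form of the usual piecewise definition. A tiny NumPy check of that identity, purely for illustration:

import numpy as np

def prelu_ref(x, alpha):
    # piecewise definition: x if x > 0 else alpha * x
    return np.where(x > 0, x, alpha * x)

def prelu_branchless(x, alpha):
    # the form used in PReLU above
    return 0.5 * ((1 + alpha) * x + (1 - alpha) * np.abs(x))

x = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
assert np.allclose(prelu_ref(x, 0.001), prelu_branchless(x, 0.001))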
@@ -18,13 +18,13 @@ __all__ = ['MaxPooling', 'FixedUnPooling', 'AvgPooling', 'GlobalAvgPooling',

 @layer_register()
 def MaxPooling(x, shape, stride=None, padding='VALID'):
     """
-    MaxPooling on images.
-
-    :param input: NHWC tensor.
-    :param shape: int or [h, w]
-    :param stride: int or [h, w]. default to be shape.
-    :param padding: 'valid' or 'same'. default to 'valid'
-    :returns: NHWC tensor.
+    Max Pooling on 4D tensors.
+
+    Args:
+        x (tf.Tensor): a NHWC tensor.
+        shape: int or (h, w) tuple
+        stride: int or (h, w) tuple. Defaults to be the same as shape.
+        padding (str): 'valid' or 'same'.
     """
     padding = padding.upper()
     shape = shape4d(shape)
@@ -39,13 +39,13 @@ def MaxPooling(x, shape, stride=None, padding='VALID'):

 @layer_register()
 def AvgPooling(x, shape, stride=None, padding='VALID'):
     """
-    Average pooling on images.
-
-    :param input: NHWC tensor.
-    :param shape: int or [h, w]
-    :param stride: int or [h, w]. default to be shape.
-    :param padding: 'valid' or 'same'. default to 'valid'
-    :returns: NHWC tensor.
+    Average Pooling on 4D tensors.
+
+    Args:
+        x (tf.Tensor): a NHWC tensor.
+        shape: int or (h, w) tuple
+        stride: int or (h, w) tuple. Defaults to be the same as shape.
+        padding (str): 'valid' or 'same'.
     """
     padding = padding.upper()
     shape = shape4d(shape)
@@ -60,19 +60,20 @@ def AvgPooling(x, shape, stride=None, padding='VALID'):

 @layer_register()
 def GlobalAvgPooling(x):
     """
-    Global average pooling as in `Network In Network
+    Global average pooling as in the paper `Network In Network
     <http://arxiv.org/abs/1312.4400>`_.

-    :param input: NHWC tensor.
-    :returns: NC tensor.
+    Args:
+        x (tf.Tensor): a NHWC tensor.
+
+    Returns:
+        tf.Tensor: a NC tensor.
     """
     assert x.get_shape().ndims == 4
     return tf.reduce_mean(x, [1, 2])


-# https://github.com/tensorflow/tensorflow/issues/2169
 def UnPooling2x2ZeroFilled(x):
+    # https://github.com/tensorflow/tensorflow/issues/2169
     out = tf.concat_v2([x, tf.zeros_like(x)], 3)
     out = tf.concat_v2([out, tf.zeros_like(out)], 2)
@@ -90,13 +91,13 @@ def UnPooling2x2ZeroFilled(x):

 @layer_register()
 def FixedUnPooling(x, shape, unpool_mat=None):
     """
-    Unpool the input with a fixed mat to perform kronecker product with.
-
-    :param input: NHWC tensor
-    :param shape: int or [h, w]
-    :param unpool_mat: a tf/np matrix with size=shape. If None, will use a mat
-        with 1 at top-left corner.
-    :returns: NHWC tensor
+    Unpool the input with a fixed matrix to perform kronecker product with.
+
+    Args:
+        x (tf.Tensor): a NHWC tensor
+        shape: int or (h, w) tuple
+        unpool_mat: a tf.Tensor or np.ndarray 2D matrix with size=shape.
+            If is None, will use a matrix with 1 at top-left corner.
     """
     shape = shape2d(shape)
@@ -129,9 +130,11 @@ def FixedUnPooling(x, shape, unpool_mat=None):

 @layer_register()
 def BilinearUpSample(x, shape):
     """
-    Deterministic bilinear upsample the input images.
-    :param x: input NHWC tensor
-    :param shape: an integer, the upsample factor
+    Deterministic bilinearly-upsample the input images.
+
+    Args:
+        x (tf.Tensor): a NHWC tensor
+        shape (int): the upsample factor
     """
     # inp_shape = tf.shape(x)
     # return tf.image.resize_bilinear(x,
...
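What "kronecker product with a fixed matrix" means for FixedUnPooling, as a NumPy sketch for a single channel (illustrative only):

import numpy as np

x = np.array([[1., 2.],
              [3., 4.]])
unpool_mat = np.array([[1., 0.],
                       [0., 0.]])   # the default: 1 at the top-left corner

# 2x2 unpooling: each input value lands at the top-left of a 2x2 block
print(np.kron(x, unpool_mat))
# [[1. 0. 2. 0.]
#  [0. 0. 0. 0.]
#  [3. 0. 4. 0.]
#  [0. 0. 0. 0.]]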
@@ -22,11 +22,21 @@ l2_regularizer = tf.contrib.layers.l2_regularizer
 l1_regularizer = tf.contrib.layers.l1_regularizer

-def regularize_cost(regex, func, name=None):
+def regularize_cost(regex, func, name='regularize_cost'):
     """
     Apply a regularizer on every trainable variable matching the regex.

-    :param func: a function that takes a tensor and return a scalar.
+    Args:
+        regex (str): a regex to match variable names, e.g. "conv.*/W"
+        func: the regularization function, which takes a tensor and returns a scalar tensor.
+
+    Returns:
+        tf.Tensor: the total regularization cost.
+
+    Example:
+        .. code-block:: python
+
+            cost = cost + regularize_cost("fc.*/W", l2_regularizer(1e-5))
     """
     G = tf.get_default_graph()
     params = G.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
@@ -45,7 +55,14 @@ def regularize_cost(regex, func, name=None):

 @layer_register(log_shape=False, use_scope=False)
 def Dropout(x, keep_prob=0.5, is_training=None):
     """
-    :param is_training: if None, will use the current context by default.
+    Dropout layer as in the paper `Dropout: a Simple Way to Prevent
+    Neural Networks from Overfitting <http://dl.acm.org/citation.cfm?id=2670313>`_.
+
+    Args:
+        keep_prob: the probability that each element is kept. It is only used
+            when is_training=True.
+        is_training: If None, will use the current :class:`tensorpack.tfutils.TowerContext`
+            to figure out.
     """
     if is_training is None:
         is_training = get_current_tower_context().is_training
...
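Putting the two layers together inside a model's graph-building code, a hedged sketch; the variable names (l, cross_entropy) are illustrative:

# Dropout picks up is_training from the current TowerContext;
# weight decay on fully-connected W matrices is added to the cost.
l = Dropout(l, keep_prob=0.5)
wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-5))
self.cost = tf.add(cross_entropy, wd_cost, name='cost')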
@@ -12,12 +12,15 @@ __all__ = ['ConcatWith']

 @layer_register(use_scope=False, log_shape=False)
 def ConcatWith(x, dim, tensor):
     """
-    A wrapper around `tf.concat_v2` to support `LinearWrap`
-
-    :param x: the input tensor
-    :param dim: the dimension along which to concatenate
-    :param tensor: a tensor or list of tensor to concatenate with x.
-        x will be at the beginning
-    :return: tf.concat_v2([x] + [tensor], dim)
+    A wrapper around ``tf.concat`` to cooperate with :class:`LinearWrap`.
+
+    Args:
+        x (tf.Tensor): input
+        dim (int): the dimension along which to concatenate
+        tensor (list[tf.Tensor]): a tensor or list of tensors to concatenate with x.
+            x will be at the beginning
+
+    Returns:
+        tf.Tensor: ``tf.concat_v2([x] + tensor, dim)``
     """
     if type(tensor) != list:
         tensor = [tensor]
...
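A short chaining sketch of why this wrapper exists: inside a LinearWrap pipeline it lets a skip connection be concatenated without breaking the chain. The layer names and the skip tensor below are illustrative:

# concatenate a skip tensor along the channel axis (dim=3 for NHWC)
l = (LinearWrap(l)
     .Conv2D('conv3', 64, 3)
     .ConcatWith(3, skip)
     .Conv2D('conv4', 64, 3)())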
@@ -12,8 +12,14 @@ __all__ = ['SoftMax']

 @layer_register()
 def SoftMax(x, use_temperature=False, temperature_init=1.0):
     """
-    A SoftMax layer (no linear projection) with optional temperature
-    :param x: a 2D tensor
+    A SoftMax layer (w/o linear projection) with optional temperature, as
+    defined in the paper `Distilling the Knowledge in a Neural Network
+    <https://arxiv.org/abs/1503.02531>`_.
+
+    Args:
+        x (tf.Tensor): input
+        use_temperature (bool): use a learnable temperature or not.
+        temperature_init (float): initial value of the temperature.
+    """
     if use_temperature:
         t = tf.get_variable('invtemp', [],
...
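For reference, temperature scaling divides the logits by T before the softmax (the layer above parameterizes this as a learnable inverse temperature). A tiny NumPy sketch of the effect, illustrating the math rather than the layer's code:

import numpy as np

def softmax_with_temperature(logits, T=1.0):
    z = logits / T            # higher T -> softer, more uniform distribution
    z = z - z.max()           # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum()

print(softmax_with_temperature(np.array([2.0, 1.0, 0.1]), T=1.0))
print(softmax_with_temperature(np.array([2.0, 1.0, 0.1]), T=5.0))  # flatter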