Commit 117fb29f authored by Yuxin Wu

misc updates

parent 6b18f4c5
......@@ -135,7 +135,6 @@ class Model(ModelDesc):
SummaryGradient()]
def predictor(self, state):
# TODO use multitower predictor to speed up training
return self.predict_value.eval(feed_dict={'state:0': [state]})[0]
def get_config():
......
......@@ -24,9 +24,9 @@ pip install --user -r tensorpack/requirements.txt
export PYTHONPATH=$PYTHONPATH:`readlink -f tensorpack`
```
+ To perform training, you'll also need [pyzmq](https://github.com/zeromq/pyzmq):
+ To perform training, you'll also need [pyzmq](https://github.com/zeromq/pyzmq) and [scipy](https://www.scipy.org/):
```
pip install --user pyzmq
pip install --user pyzmq scipy
```
+ The pretrained model is hosted at [google drive](https://drive.google.com/open?id=0B308TeQzmFDLa0xOeVQwcXg1ZjQ)
......@@ -56,8 +56,8 @@ To eval on ILSVRC12, `path/to/ILSVRC12` must have a subdirectory named 'val' con
Please use [github issues](https://github.com/ppwwyyxx/tensorpack/issues) for any issues related to the code.
Send email to the authors for other questions related to the paper.
Note that although the model uses low bitwidth weights, activations and gradients, those numbers in
this script are still represented in `tf.float32`. We're not releasing the run-time kernels that speed it up.
Note that although the model uses low bitwidth weights, activations and gradients, these values
here are still represented in `tf.float32`, since TensorFlow doesn't natively support low bitwidth computation.
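For intuition, here is a minimal sketch (a hypothetical helper, not the exact code in this repo) of how a value in [0, 1] can be restricted to 2^k levels while its dtype stays `tf.float32`; the real training script additionally overrides the gradient of the rounding op, since `tf.round` is not differentiable:
```
import tensorflow as tf

def quantize_to_k_bits(x, k):
    # Uniformly quantize x (assumed to lie in [0, 1]) to 2**k levels.
    # The result is still an ordinary float32 tensor; only the set of
    # values it can take is restricted.
    n = float(2 ** k - 1)
    return tf.round(x * n) / n

# 2-bit values can only be 0, 1/3, 2/3 or 1:
a = quantize_to_k_bits(tf.constant([0.1, 0.4, 0.9]), 2)
```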
## Citation
......
......@@ -21,7 +21,8 @@ The original experiments are performed on a proprietary framework.
This is our attempt to reproduce it on tensorpack.
This config, with (W,A,G)=(1,1,4), can reach 3.1~3.2% error after 150 epochs.
With the GaussianDeform augmentor, it will reach 2.8~2.9%.
With the GaussianDeform augmentor, it will reach 2.8~2.9%
(we did not use this augmentor in the paper).
"""
BITW = 1
......@@ -65,13 +66,13 @@ def get_dorefa(bitW, bitA, bitG):
x = tf.clip_by_value(x, 0.0, 1.0)
x = quantize(x, bitG) - 0.5
return x * maxx * 2
GRAD_DEFINED = True
def fg(x):
if bitG == 32:
return x
with G.gradient_override_map({"Identity": "FGGrad"}):
return tf.identity(x)
GRAD_DEFINED = True
return fw, fa, fg
class Model(ModelDesc):
......
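For readers unfamiliar with the `gradient_override_map` trick that `fg` relies on above, here is a minimal standalone sketch; the gradient name `ClipGrad` and the clipping rule are made up for illustration and are not the DoReFa gradient quantizer:
```
import tensorflow as tf

@tf.RegisterGradient("ClipGrad")
def _clip_grad(op, grad):
    # Backward pass: rewrite the incoming gradient (here, just clip it).
    return tf.clip_by_value(grad, -1.0, 1.0)

def pass_through_with_clipped_grad(x):
    # Forward pass: a plain identity. The override map only swaps the
    # gradient registered for "Identity" within this scope.
    g = tf.get_default_graph()
    with g.gradient_override_map({"Identity": "ClipGrad"}):
        return tf.identity(x)
```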
......@@ -39,17 +39,13 @@ class Model(ModelDesc):
l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm1')
l = MaxPooling('pool1', l, 3, stride=2, padding='VALID')
l = Conv2D('conv2', l, out_channel=256, kernel_shape=5,
padding='SAME', split=2)
l = Conv2D('conv2', l, out_channel=256, kernel_shape=5, split=2)
l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm2')
l = MaxPooling('pool2', l, 3, stride=2, padding='VALID')
l = Conv2D('conv3', l, out_channel=384, kernel_shape=3,
padding='SAME')
l = Conv2D('conv4', l, out_channel=384, kernel_shape=3,
padding='SAME', split=2)
l = Conv2D('conv5', l, out_channel=256, kernel_shape=3,
padding='SAME', split=2)
l = Conv2D('conv3', l, out_channel=384, kernel_shape=3)
l = Conv2D('conv4', l, out_channel=384, kernel_shape=3, split=2)
l = Conv2D('conv5', l, out_channel=256, kernel_shape=3, split=2)
l = MaxPooling('pool3', l, 3, stride=2, padding='VALID')
l = FullyConnected('fc6', l, 4096)
......
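The `split=2` arguments above correspond to the two-group convolutions of the original AlexNet. A rough sketch of the idea, assuming the TF 1.x `tf.split`/`tf.concat` signatures (the helper name is illustrative, not tensorpack's `Conv2D`):
```
import tensorflow as tf

def grouped_conv_sketch(x, filters_a, filters_b):
    # Split the input channels into two halves, convolve each half with its
    # own filters, then concatenate the results along the channel axis.
    xa, xb = tf.split(x, 2, axis=3)
    ya = tf.nn.conv2d(xa, filters_a, strides=[1, 1, 1, 1], padding='SAME')
    yb = tf.nn.conv2d(xb, filters_b, strides=[1, 1, 1, 1], padding='SAME')
    return tf.concat([ya, yb], axis=3)
```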
......@@ -28,17 +28,17 @@ class Model(ModelDesc):
image = image / 128.0 - 1
logits = LinearWrap(image) \
.Conv2D('conv1', 24, 5, padding='VALID') \
.MaxPooling('pool1', 2, padding='SAME') \
.Conv2D('conv2', 32, 3, padding='VALID') \
.Conv2D('conv3', 32, 3, padding='VALID') \
.MaxPooling('pool2', 2, padding='SAME') \
.Conv2D('conv4', 64, 3, padding='VALID') \
.tf.nn.dropout(keep_prob) \
logits = (LinearWrap(image)
.Conv2D('conv1', 24, 5, padding='VALID')
.MaxPooling('pool1', 2, padding='SAME')
.Conv2D('conv2', 32, 3, padding='VALID')
.Conv2D('conv3', 32, 3, padding='VALID')
.MaxPooling('pool2', 2, padding='SAME')
.Conv2D('conv4', 64, 3, padding='VALID')
.tf.nn.dropout(keep_prob)
.FullyConnected('fc0', 512,
b_init=tf.constant_initializer(0.1)) \
.FullyConnected('linear', out_dim=10, nl=tf.identity)()
b_init=tf.constant_initializer(0.1))
.FullyConnected('linear', out_dim=10, nl=tf.identity)())
prob = tf.nn.softmax(logits, name='output')
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
......
......@@ -30,6 +30,7 @@ class LinearWrap(object):
def __init__(self, mod, tensor):
self._mod = mod
self._t = tensor
def __getattr__(self, name):
ret = getattr(self._mod, name)
if isinstance(ret, ModuleType):
......@@ -66,4 +67,7 @@ class LinearWrap(object):
def __call__(self):
return self._t
def tensor(self):
return self._t
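As a stripped-down illustration of why chains like `.tf.nn.dropout(keep_prob)` resolve, here is a sketch of the idea (not tensorpack's actual `LinearWrap`, which dispatches to registered layers; this toy version assumes every callable takes the tensor as its first argument):
```
from types import ModuleType
import tensorflow as tf

class ChainSketch(object):
    def __init__(self, tensor, mod=tf):
        self._t = tensor
        self._mod = mod

    def __getattr__(self, name):
        ret = getattr(self._mod, name)
        if isinstance(ret, ModuleType):
            # e.g. `.nn`: keep the current tensor, descend into the submodule
            return ChainSketch(self._t, ret)
        # otherwise assume a function taking the tensor as first argument
        def _call(*args, **kwargs):
            return ChainSketch(ret(self._t, *args, **kwargs))
        return _call

    def __call__(self):
        # unwrap the underlying tensor, mirroring LinearWrap's () / tensor()
        return self._t

# usage sketch: y = ChainSketch(x).nn.relu().nn.dropout(0.5)()
```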
......@@ -16,16 +16,20 @@ def Maxout(x, num_unit):
"""
Maxout networks as in `Maxout Networks <http://arxiv.org/abs/1302.4389>`_.
:param input: a NHWC tensor.
:param x: an NHWC or NC tensor.
:param num_unit: an int. Must divide C.
:returns: an NHW(C/num_unit) or N(C/num_unit) tensor
"""
input_shape = x.get_shape().as_list()
assert len(input_shape) == 4
ch = input_shape[3]
assert ch % num_unit == 0
x = tf.reshape(x, [-1, input_shape[1], input_shape[2], ch / num_unit, num_unit])
return tf.reduce_max(x, 4, name='output')
ndim = len(input_shape)
assert ndim == 4 or ndim == 2
ch = input_shape[-1]
assert ch is not None and ch % num_unit == 0
if ndim == 4:
x = tf.reshape(x, [-1, input_shape[1], input_shape[2], ch / num_unit, num_unit])
else:
x = tf.reshape(x, [-1, ch / num_unit, num_unit])
return tf.reduce_max(x, ndim, name='output')
@layer_register(log_shape=False)
def PReLU(x, init=tf.constant_initializer(0.001), name=None):
......
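A tiny numerical check of the reshape-and-reduce idea behind `Maxout` on an NC tensor, with values chosen for illustration:
```
import tensorflow as tf

# Channels are grouped into blocks of `num_unit`; the max of each block is kept.
x = tf.constant([[1., 5., 2., 4., 3., 0.]])               # shape (1, 6)
num_unit = 2
grouped = tf.reshape(x, [-1, 6 // num_unit, num_unit])    # shape (1, 3, 2)
y = tf.reduce_max(grouped, 2)                             # -> [[5., 4., 3.]]
```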
......@@ -117,7 +117,7 @@ def FixedUnPooling(x, shape, unpool_mat=None):
@layer_register()
def BilinearUpSample(x, shape):
"""
Bilinear upsample the input images.
Non-parametric bilinear upsampling of the input images.
:param x: input NHWC tensor
:param shape: an integer
"""
......
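Functionally, a non-parametric bilinear upsample of an NHWC tensor by an integer factor behaves like a bilinear resize. A rough sketch of the intended result (the layer itself may be implemented differently, e.g. as a fixed transposed convolution):
```
import tensorflow as tf

def bilinear_upsample_sketch(x, factor):
    # Resize an NHWC tensor by `factor` with bilinear interpolation.
    shape = tf.shape(x)
    new_size = tf.stack([shape[1] * factor, shape[2] * factor])
    return tf.image.resize_bilinear(x, new_size)
```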