results of TIMIT

99362bfd · Yuxin Wu · 37a0f153 · 99362bfd · 99362bfd · 99362bfd
Commit 99362bfd authored Dec 13, 2016 by Yuxin Wu
7 changed files
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ They're not only for demonstration of the framework -- you can train them and re
 + [InceptionV3 on ImageNet](examples/Inception/inceptionv3.py)
 + [Fully-convolutional Network for Holistically-Nested Edge Detection(HED)](examples/HED)
 + [Spatial Transformer Network on MNIST addition](examples/SpatialTransformer)
-+ [Generative Adversarial Network(GAN) variants (DCGAN,Image2Image,InfoGAN)](examples/GAN)
++ [Generative Adversarial Network(GAN) variants, including DCGAN, Image2Image, InfoGAN](examples/GAN)
 + [Deep Q-Network(DQN) variants on Atari games](examples/Atari2600)
 + [Asynchronous Advantage Actor-Critic(A3C) with demos on OpenAI Gym](examples/OpenAIGym)
 + [LSTM-CTC for speech recognition](examples/TIMIT)

--- a/examples/GAN/DCGAN-CelebA.py
+++ b/examples/GAN/DCGAN-CelebA.py
@@ -38,28 +38,30 @@ class Model(ModelDesc):
    def generator(self, z):
        """ return a image generated from z"""
-        l = FullyConnected('fc0', z, 64 * 8 * 4 * 4, nl=tf.identity)
+        nf = 64
-        l = tf.reshape(l, [-1, 4, 4, 64*8])
+        l = FullyConnected('fc0', z, nf * 8 * 4 * 4, nl=tf.identity)
+        l = tf.reshape(l, [-1, 4, 4, nf*8])
        l = BNReLU(l)
        with argscope(Deconv2D, nl=BNReLU, kernel_shape=4, stride=2):
-            l = Deconv2D('deconv1', l, [8, 8, 64 * 4])
+            l = Deconv2D('deconv1', l, [8, 8, nf * 4])
-            l = Deconv2D('deconv2', l, [16, 16, 64 * 2])
+            l = Deconv2D('deconv2', l, [16, 16, nf * 2])
-            l = Deconv2D('deconv3', l, [32, 32, 64])
+            l = Deconv2D('deconv3', l, [32, 32, nf])
            l = Deconv2D('deconv4', l, [64, 64, 3], nl=tf.identity)
            l = tf.tanh(l, name='gen')
        return l
    def discriminator(self, imgs):
        """ return a (b, 1) logits"""
+        nf = 64
        with argscope(Conv2D, nl=tf.identity, kernel_shape=4, stride=2), \
                argscope(LeakyReLU, alpha=0.2):
            l = (LinearWrap(imgs)
-                .Conv2D('conv0', 64, nl=LeakyReLU)
+                .Conv2D('conv0', nf, nl=LeakyReLU)
-                .Conv2D('conv1', 64*2)
+                .Conv2D('conv1', nf*2)
                .BatchNorm('bn1').LeakyReLU()
-                .Conv2D('conv2', 64*4)
+                .Conv2D('conv2', nf*4)
                .BatchNorm('bn2').LeakyReLU()
-                .Conv2D('conv3', 64*8)
+                .Conv2D('conv3', nf*8)
                .BatchNorm('bn3').LeakyReLU()
                .FullyConnected('fct', 1, nl=tf.identity)())
        return l
@@ -125,24 +127,6 @@ def sample(model_path):
        o = o[:,:,:,::-1]
        viz = next(build_patch_list(o, nr_row=10, nr_col=10, viz=True))
-#def vec(model_path):
-    #func = OfflinePredictor(PredictConfig(
-       #session_init=get_model_loader(model_path),
-       #model=Model(),
-       #input_names=['z'],
-       #output_names=['gen/gen']))
-    #dic = np.load('demo/CelebA-vec.npy').item()
-    #assert np.all(
-            #dic['w_smile'] - dic['w_neutral'] \
-                    #+ dic['m_neutral'] == dic['m_smile'])
-    #imgs = []
-    #for z in ['w_neutral', 'w_smile', 'm_neutral', 'm_smile']:
-        #z = dic[z]
-        #img = func([[z]])[0][0][:,:,::-1]
-        #img = (img + 1) * 128
-        #imgs.append(img)
-    #viz = next(build_patch_list(imgs, nr_row=1, nr_col=4, viz=True))
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')

--- a/examples/README.md
+++ b/examples/README.md
@@ -19,7 +19,7 @@ Training examples with __reproducible__ and meaningful performance.
 + [Asynchronous Advantage Actor-Critic(A3C) with demos on OpenAI Gym](OpenAIGym)
 ## Unsupervised:
-+ [Generative Adversarial Network(GAN) variants (DCGAN,Image2Image,InfoGAN)](examples/GAN)
++ [Generative Adversarial Network(GAN) variants, including DCGAN, Image2Image, InfoGAN](examples/GAN)
 ## Speech / NLP:
 + [LSTM-CTC for speech recognition](TIMIT)

--- a/examples/TIMIT/README.md
+++ b/examples/TIMIT/README.md
@@ -15,7 +15,10 @@ cd /PATH/TO/TIMIT
 find -name '*.WAV' | parallel -P20 sox {} '{.}.wav'
 ```
-Extract MFCC features and save everything to LMDB database:
+Extract MFCC features and phoneme labels, and save everything to LMDB database. The preprocessing
+follows the setup in
+Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with RNN - Alex Graves
 ```
 ./create-lmdb.py build --dataset /PATH/TO/TIMIT/TRAIN --db train.mdb
 ./create-lmdb.py build --dataset /PATH/TO/TIMIT/TEST --db test.mdb
@@ -32,4 +35,4 @@ Compute mean/std of the training set (and save to `stats.data` by default):
 ```
 ### Results:
-To be done
+Get 0.28 LER (normalized edit distance) after about 40 epochs.
--- a/examples/TIMIT/train-timit.py
+++ b/examples/TIMIT/train-timit.py
@@ -57,8 +57,15 @@ class Model(ModelDesc):
        self.cost = tf.reduce_mean(loss, name='cost')
        logits = tf.transpose(logits, [1,0,2])
-        predictions = tf.to_int32(
-                tf.nn.ctc_beam_search_decoder(logits, seqlen)[0][0])
+        isTrain = get_current_tower_context().is_training
+        if isTrain:
+            # beam search is too slow to run in training
+            predictions = tf.to_int32(
+                    tf.nn.ctc_greedy_decoder(logits, seqlen)[0][0])
+        else:
+            predictions = tf.to_int32(
+                    tf.nn.ctc_beam_search_decoder(logits, seqlen)[0][0])
        err = tf.edit_distance(predictions, label, normalize=True)
        err.set_shape([None])
        err = tf.reduce_mean(err, name='error')
@@ -94,7 +101,7 @@ def get_config(ds_train, ds_test):
        ]),
        model=Model(),
        step_per_epoch=step_per_epoch,
-        max_epoch=300,
+        max_epoch=70,
    )
 if __name__ == '__main__':
@@ -104,7 +111,7 @@ if __name__ == '__main__':
    parser.add_argument('--train', help='path to training lmdb', required=True)
    parser.add_argument('--test', help='path to testing lmdb', required=True)
    parser.add_argument('--stat', help='path to the mean/std statistics file',
-            default='stats.data')
+                        default='stats.data')
    args = parser.parse_args()
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

--- a/tensorpack/tfutils/summary.py
+++ b/tensorpack/tfutils/summary.py
@@ -115,7 +115,7 @@ def summary_moving_average(tensors=None):
    # TODO will produce tower0/xxx. not elegant
    with tf.name_scope(None):
        averager = tf.train.ExponentialMovingAverage(
-            0.90, num_updates=get_global_step_var(), name='EMA')
+            0.95, num_updates=get_global_step_var(), name='EMA')
    avg_maintain_op = averager.apply(tensors)
    for idx, c in enumerate(tensors):
        name = re.sub('tower[p0-9]+/', '', c.op.name)

--- a/tensorpack/train/queue.py
+++ b/tensorpack/train/queue.py
@@ -10,7 +10,7 @@ from ..tfutils import get_global_step_var
 from ..tfutils.tower import TowerContext
 from ..tfutils.gradproc import apply_grad_processors
 from ..tfutils.summary import summary_moving_average
-from .input_data import QueueInput
+from .input_data import QueueInput, FeedfreeInput
 from .trainer import (MultiPredictorTowerTrainer, SingleCostFeedfreeTrainer)
@@ -25,6 +25,7 @@ class SimpleFeedfreeTrainer(
        config.data must exists
        """
        self._input_method = config.data
+        assert isinstance(self._input_method, FeedfreeInput), self._input_method
        super(SimpleFeedfreeTrainer, self).__init__(config)
        self._setup_predictor_factory(predict_tower)
        assert len(self.config.tower) == 1, \