Commit 99362bfd authored by Yuxin Wu

results of TIMIT

parent 37a0f153
...@@ -9,7 +9,7 @@ They're not only for demonstration of the framework -- you can train them and re ...@@ -9,7 +9,7 @@ They're not only for demonstration of the framework -- you can train them and re
+ [InceptionV3 on ImageNet](examples/Inception/inceptionv3.py) + [InceptionV3 on ImageNet](examples/Inception/inceptionv3.py)
+ [Fully-convolutional Network for Holistically-Nested Edge Detection(HED)](examples/HED) + [Fully-convolutional Network for Holistically-Nested Edge Detection(HED)](examples/HED)
+ [Spatial Transformer Network on MNIST addition](examples/SpatialTransformer) + [Spatial Transformer Network on MNIST addition](examples/SpatialTransformer)
+ [Generative Adversarial Network(GAN) variants (DCGAN,Image2Image,InfoGAN)](examples/GAN) + [Generative Adversarial Network(GAN) variants, including DCGAN, Image2Image, InfoGAN](examples/GAN)
+ [Deep Q-Network(DQN) variants on Atari games](examples/Atari2600) + [Deep Q-Network(DQN) variants on Atari games](examples/Atari2600)
+ [Asynchronous Advantage Actor-Critic(A3C) with demos on OpenAI Gym](examples/OpenAIGym) + [Asynchronous Advantage Actor-Critic(A3C) with demos on OpenAI Gym](examples/OpenAIGym)
+ [LSTM-CTC for speech recognition](examples/TIMIT) + [LSTM-CTC for speech recognition](examples/TIMIT)
......
...@@ -38,28 +38,30 @@ class Model(ModelDesc): ...@@ -38,28 +38,30 @@ class Model(ModelDesc):
def generator(self, z): def generator(self, z):
""" return a image generated from z""" """ return a image generated from z"""
l = FullyConnected('fc0', z, 64 * 8 * 4 * 4, nl=tf.identity) nf = 64
l = tf.reshape(l, [-1, 4, 4, 64*8]) l = FullyConnected('fc0', z, nf * 8 * 4 * 4, nl=tf.identity)
l = tf.reshape(l, [-1, 4, 4, nf*8])
l = BNReLU(l) l = BNReLU(l)
with argscope(Deconv2D, nl=BNReLU, kernel_shape=4, stride=2): with argscope(Deconv2D, nl=BNReLU, kernel_shape=4, stride=2):
l = Deconv2D('deconv1', l, [8, 8, 64 * 4]) l = Deconv2D('deconv1', l, [8, 8, nf * 4])
l = Deconv2D('deconv2', l, [16, 16, 64 * 2]) l = Deconv2D('deconv2', l, [16, 16, nf * 2])
l = Deconv2D('deconv3', l, [32, 32, 64]) l = Deconv2D('deconv3', l, [32, 32, nf])
l = Deconv2D('deconv4', l, [64, 64, 3], nl=tf.identity) l = Deconv2D('deconv4', l, [64, 64, 3], nl=tf.identity)
l = tf.tanh(l, name='gen') l = tf.tanh(l, name='gen')
return l return l
def discriminator(self, imgs): def discriminator(self, imgs):
""" return a (b, 1) logits""" """ return a (b, 1) logits"""
nf = 64
with argscope(Conv2D, nl=tf.identity, kernel_shape=4, stride=2), \ with argscope(Conv2D, nl=tf.identity, kernel_shape=4, stride=2), \
argscope(LeakyReLU, alpha=0.2): argscope(LeakyReLU, alpha=0.2):
l = (LinearWrap(imgs) l = (LinearWrap(imgs)
.Conv2D('conv0', 64, nl=LeakyReLU) .Conv2D('conv0', nf, nl=LeakyReLU)
.Conv2D('conv1', 64*2) .Conv2D('conv1', nf*2)
.BatchNorm('bn1').LeakyReLU() .BatchNorm('bn1').LeakyReLU()
.Conv2D('conv2', 64*4) .Conv2D('conv2', nf*4)
.BatchNorm('bn2').LeakyReLU() .BatchNorm('bn2').LeakyReLU()
.Conv2D('conv3', 64*8) .Conv2D('conv3', nf*8)
.BatchNorm('bn3').LeakyReLU() .BatchNorm('bn3').LeakyReLU()
.FullyConnected('fct', 1, nl=tf.identity)()) .FullyConnected('fct', 1, nl=tf.identity)())
return l return l
...@@ -125,24 +127,6 @@ def sample(model_path): ...@@ -125,24 +127,6 @@ def sample(model_path):
o = o[:,:,:,::-1] o = o[:,:,:,::-1]
viz = next(build_patch_list(o, nr_row=10, nr_col=10, viz=True)) viz = next(build_patch_list(o, nr_row=10, nr_col=10, viz=True))
#def vec(model_path):
#func = OfflinePredictor(PredictConfig(
#session_init=get_model_loader(model_path),
#model=Model(),
#input_names=['z'],
#output_names=['gen/gen']))
#dic = np.load('demo/CelebA-vec.npy').item()
#assert np.all(
#dic['w_smile'] - dic['w_neutral'] \
#+ dic['m_neutral'] == dic['m_smile'])
#imgs = []
#for z in ['w_neutral', 'w_smile', 'm_neutral', 'm_smile']:
#z = dic[z]
#img = func([[z]])[0][0][:,:,::-1]
#img = (img + 1) * 128
#imgs.append(img)
#viz = next(build_patch_list(imgs, nr_row=1, nr_col=4, viz=True))
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.') parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
......
...@@ -19,7 +19,7 @@ Training examples with __reproducible__ and meaningful performance. ...@@ -19,7 +19,7 @@ Training examples with __reproducible__ and meaningful performance.
+ [Asynchronous Advantage Actor-Critic(A3C) with demos on OpenAI Gym](OpenAIGym) + [Asynchronous Advantage Actor-Critic(A3C) with demos on OpenAI Gym](OpenAIGym)
## Unsupervised: ## Unsupervised:
+ [Generative Adversarial Network(GAN) variants (DCGAN,Image2Image,InfoGAN)](examples/GAN) + [Generative Adversarial Network(GAN) variants, including DCGAN, Image2Image, InfoGAN](examples/GAN)
## Speech / NLP: ## Speech / NLP:
+ [LSTM-CTC for speech recognition](TIMIT) + [LSTM-CTC for speech recognition](TIMIT)
......
...@@ -15,7 +15,10 @@ cd /PATH/TO/TIMIT ...@@ -15,7 +15,10 @@ cd /PATH/TO/TIMIT
find -name '*.WAV' | parallel -P20 sox {} '{.}.wav' find -name '*.WAV' | parallel -P20 sox {} '{.}.wav'
``` ```
Extract MFCC features and save everything to LMDB database: Extract MFCC features and phoneme labels, and save everything to LMDB database. The preprocessing
follows the setup in
+ Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks - Alex Graves et al.
``` ```
./create-lmdb.py build --dataset /PATH/TO/TIMIT/TRAIN --db train.mdb ./create-lmdb.py build --dataset /PATH/TO/TIMIT/TRAIN --db train.mdb
./create-lmdb.py build --dataset /PATH/TO/TIMIT/TEST --db test.mdb ./create-lmdb.py build --dataset /PATH/TO/TIMIT/TEST --db test.mdb
...@@ -32,4 +35,4 @@ Compute mean/std of the training set (and save to `stats.data` by default): ...@@ -32,4 +35,4 @@ Compute mean/std of the training set (and save to `stats.data` by default):
``` ```
### Results: ### Results:
To be done Get 0.28 LER (label error rate, measured as normalized edit distance) after about 40 epochs.
...@@ -57,8 +57,15 @@ class Model(ModelDesc): ...@@ -57,8 +57,15 @@ class Model(ModelDesc):
self.cost = tf.reduce_mean(loss, name='cost') self.cost = tf.reduce_mean(loss, name='cost')
logits = tf.transpose(logits, [1,0,2]) logits = tf.transpose(logits, [1,0,2])
predictions = tf.to_int32(
tf.nn.ctc_beam_search_decoder(logits, seqlen)[0][0]) isTrain = get_current_tower_context().is_training
if isTrain:
# beam search is too slow to run in training
predictions = tf.to_int32(
tf.nn.ctc_greedy_decoder(logits, seqlen)[0][0])
else:
predictions = tf.to_int32(
tf.nn.ctc_beam_search_decoder(logits, seqlen)[0][0])
err = tf.edit_distance(predictions, label, normalize=True) err = tf.edit_distance(predictions, label, normalize=True)
err.set_shape([None]) err.set_shape([None])
err = tf.reduce_mean(err, name='error') err = tf.reduce_mean(err, name='error')
...@@ -94,7 +101,7 @@ def get_config(ds_train, ds_test): ...@@ -94,7 +101,7 @@ def get_config(ds_train, ds_test):
]), ]),
model=Model(), model=Model(),
step_per_epoch=step_per_epoch, step_per_epoch=step_per_epoch,
max_epoch=300, max_epoch=70,
) )
if __name__ == '__main__': if __name__ == '__main__':
...@@ -104,7 +111,7 @@ if __name__ == '__main__': ...@@ -104,7 +111,7 @@ if __name__ == '__main__':
parser.add_argument('--train', help='path to training lmdb', required=True) parser.add_argument('--train', help='path to training lmdb', required=True)
parser.add_argument('--test', help='path to testing lmdb', required=True) parser.add_argument('--test', help='path to testing lmdb', required=True)
parser.add_argument('--stat', help='path to the mean/std statistics file', parser.add_argument('--stat', help='path to the mean/std statistics file',
default='stats.data') default='stats.data')
args = parser.parse_args() args = parser.parse_args()
if args.gpu: if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
......
...@@ -115,7 +115,7 @@ def summary_moving_average(tensors=None): ...@@ -115,7 +115,7 @@ def summary_moving_average(tensors=None):
# TODO will produce tower0/xxx. not elegant # TODO will produce tower0/xxx. not elegant
with tf.name_scope(None): with tf.name_scope(None):
averager = tf.train.ExponentialMovingAverage( averager = tf.train.ExponentialMovingAverage(
0.90, num_updates=get_global_step_var(), name='EMA') 0.95, num_updates=get_global_step_var(), name='EMA')
avg_maintain_op = averager.apply(tensors) avg_maintain_op = averager.apply(tensors)
for idx, c in enumerate(tensors): for idx, c in enumerate(tensors):
name = re.sub('tower[p0-9]+/', '', c.op.name) name = re.sub('tower[p0-9]+/', '', c.op.name)
......
...@@ -10,7 +10,7 @@ from ..tfutils import get_global_step_var ...@@ -10,7 +10,7 @@ from ..tfutils import get_global_step_var
from ..tfutils.tower import TowerContext from ..tfutils.tower import TowerContext
from ..tfutils.gradproc import apply_grad_processors from ..tfutils.gradproc import apply_grad_processors
from ..tfutils.summary import summary_moving_average from ..tfutils.summary import summary_moving_average
from .input_data import QueueInput from .input_data import QueueInput, FeedfreeInput
from .trainer import (MultiPredictorTowerTrainer, SingleCostFeedfreeTrainer) from .trainer import (MultiPredictorTowerTrainer, SingleCostFeedfreeTrainer)
...@@ -25,6 +25,7 @@ class SimpleFeedfreeTrainer( ...@@ -25,6 +25,7 @@ class SimpleFeedfreeTrainer(
config.data must exists config.data must exists
""" """
self._input_method = config.data self._input_method = config.data
assert isinstance(self._input_method, FeedfreeInput), self._input_method
super(SimpleFeedfreeTrainer, self).__init__(config) super(SimpleFeedfreeTrainer, self).__init__(config)
self._setup_predictor_factory(predict_tower) self._setup_predictor_factory(predict_tower)
assert len(self.config.tower) == 1, \ assert len(self.config.tower) == 1, \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment