Commit 1f3eaf97 authored by Yuxin Wu

bug fix in Double-DQN

parent d6f1b6ee
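Context for the fix: in Double-DQN, the target evaluates the target network at the action the online network rates greedy for the next state, i.e. target = reward + (1 - isOver) * GAMMA * Q_target(next_state, argmax_a Q_online(next_state, a)). The diff below repairs the selection half of that rule: the removed line computed greedy_choice as the argmax of Q_online over the current state, while the added lines recompute it from the online network applied (via variable reuse) to next_state.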
@@ -92,12 +92,11 @@ class Model(ModelDesc):
     def _build_graph(self, inputs, is_training):
         state, action, reward, next_state, isOver = inputs
         self.predict_value = self._get_DQN_prediction(state, is_training)
-        action_onehot = tf.one_hot(action, NUM_ACTIONS)
+        action_onehot = tf.one_hot(action, NUM_ACTIONS, 1.0, 0.0)
         pred_action_value = tf.reduce_sum(self.predict_value * action_onehot, 1)    #N,
         max_pred_reward = tf.reduce_mean(tf.reduce_max(
             self.predict_value, 1), name='predict_reward')
         add_moving_summary(max_pred_reward)
-        self.greedy_choice = tf.argmax(self.predict_value, 1)   # N,

         with tf.variable_scope('target'):
             targetQ_predict_value = self._get_DQN_prediction(next_state, False)    # NxA
@@ -106,9 +105,13 @@ class Model(ModelDesc):
         #best_v = tf.reduce_max(targetQ_predict_value, 1)    # N,

         # Double-DQN
+        tf.get_variable_scope().reuse_variables()
+        next_predict_value = self._get_DQN_prediction(next_state, is_training)
+        self.greedy_choice = tf.argmax(next_predict_value, 1)   # N,
         predict_onehot = tf.one_hot(self.greedy_choice, NUM_ACTIONS, 1.0, 0.0)
         best_v = tf.reduce_sum(targetQ_predict_value * predict_onehot, 1)

         target = reward + (1.0 - tf.cast(isOver, tf.float32)) * GAMMA * tf.stop_gradient(best_v)
         sqrcost = tf.square(target - pred_action_value)
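For reference, a minimal self-contained sketch (not this repository's code) contrasting the two target computations distinguished above; q_online_next and q_target_next are assumed NxA arrays of Q-values for the next states, standing in for next_predict_value and targetQ_predict_value:

import numpy as np

def dqn_target(reward, is_over, q_target_next, gamma=0.99):
    # Vanilla DQN: the target network both selects and evaluates the
    # next action (the commented-out reduce_max above), which is known
    # to overestimate action values.
    best_v = q_target_next.max(axis=1)                              # N,
    return reward + (1.0 - is_over) * gamma * best_v

def double_dqn_target(reward, is_over, q_online_next, q_target_next, gamma=0.99):
    # Double-DQN: the online network picks the greedy action for the
    # next state; the target network evaluates that action.
    greedy_choice = q_online_next.argmax(axis=1)                    # N,
    best_v = q_target_next[np.arange(q_target_next.shape[0]), greedy_choice]
    return reward + (1.0 - is_over) * gamma * best_v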
@@ -17,6 +17,7 @@ __all__ = ['total_timer', 'timed_operation',
            'print_total_timer', 'IterSpeedCounter']

 class IterSpeedCounter(object):
+    """ To count how often some code gets reached"""
     def __init__(self, print_every, name=None):
         self.cnt = 0
         self.print_every = int(print_every)
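The hunk above shows only the head of IterSpeedCounter. Below is a minimal standalone sketch of the pattern such a counter implements; the __call__ body and the 'IterSpeed' default label are assumptions for illustration, not necessarily this file's implementation:

import time

class IterSpeedCounter(object):
    """ To count how often some code gets reached"""
    def __init__(self, print_every, name=None):
        self.cnt = 0
        self.print_every = int(print_every)
        self.name = name if name else 'IterSpeed'   # default label is an assumption

    def __call__(self):
        if self.cnt == 0:
            self.start = time.time()    # start timing at the first call
        self.cnt += 1
        if self.cnt % self.print_every:
            return
        elapsed = max(time.time() - self.start, 1e-9)   # avoid division by zero
        print('{}: {} iterations, {:.2f} iters/sec'.format(
            self.name, self.cnt, self.cnt / elapsed))

# Usage: call once per iteration; a rate line prints every `print_every` calls.
speed = IterSpeedCounter(100, name='train')
for _ in range(300):
    speed()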