Commit 1f3eaf97 authored by Yuxin Wu

bug fix in Double-DQN

parent d6f1b6ee
@@ -92,24 +92,27 @@ class Model(ModelDesc):
     def _build_graph(self, inputs, is_training):
         state, action, reward, next_state, isOver = inputs
         self.predict_value = self._get_DQN_prediction(state, is_training)
-        action_onehot = tf.one_hot(action, NUM_ACTIONS)
+        action_onehot = tf.one_hot(action, NUM_ACTIONS, 1.0, 0.0)
         pred_action_value = tf.reduce_sum(self.predict_value * action_onehot, 1)  #N,
         max_pred_reward = tf.reduce_mean(tf.reduce_max(
             self.predict_value, 1), name='predict_reward')
         add_moving_summary(max_pred_reward)
-        self.greedy_choice = tf.argmax(self.predict_value, 1)  # N,

         with tf.variable_scope('target'):
             targetQ_predict_value = self._get_DQN_prediction(next_state, False)  # NxA
             # DQN
             #best_v = tf.reduce_max(targetQ_predict_value, 1)  # N,
             # Double-DQN
-            predict_onehot = tf.one_hot(self.greedy_choice, NUM_ACTIONS, 1.0, 0.0)
-            best_v = tf.reduce_sum(targetQ_predict_value * predict_onehot, 1)
+            tf.get_variable_scope().reuse_variables()
+            next_predict_value = self._get_DQN_prediction(next_state, is_training)
+            self.greedy_choice = tf.argmax(next_predict_value, 1)  # N,
+            predict_onehot = tf.one_hot(self.greedy_choice, NUM_ACTIONS, 1.0, 0.0)
+            best_v = tf.reduce_sum(targetQ_predict_value * predict_onehot, 1)

         target = reward + (1.0 - tf.cast(isOver, tf.float32)) * GAMMA * tf.stop_gradient(best_v)

         sqrcost = tf.square(target - pred_action_value)
         abscost = tf.abs(target - pred_action_value)  # robust error func
...
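For readers skimming the commit: before this change, `self.greedy_choice` was the argmax over `self.predict_value`, i.e. the Q-values of the *current* state, and that action was then used to index the target network's Q-values for `next_state`. The fix recomputes the greedy action from a prediction on `next_state` (reusing variables) and only then evaluates that action with the target network, i.e. the Double-DQN target `target = reward + (1 - isOver) * GAMMA * Q_target(next_state, argmax_a Q(next_state, a))`. Below is a minimal NumPy sketch of that target computation, independent of the graph code above; all names (`double_dqn_target`, `q_next`, `target_q_next`) are illustrative and not from the repository.

```python
import numpy as np

GAMMA = 0.99  # same role as the GAMMA hyperparameter in the diff above

def double_dqn_target(reward, is_over, q_next, target_q_next, gamma=GAMMA):
    """Illustrative Double-DQN target (names are made up for this sketch).

    reward, is_over: shape (N,)
    q_next:          Q(next_state, a) used for action *selection*, shape (N, A)
    target_q_next:   Q(next_state, a) from the target net, used for *evaluation*, shape (N, A)
    """
    greedy_choice = np.argmax(q_next, axis=1)                             # pick the action on next_state
    best_v = target_q_next[np.arange(len(greedy_choice)), greedy_choice]  # evaluate it with the target net
    return reward + (1.0 - is_over.astype(np.float32)) * gamma * best_v   # no bootstrap on terminal states

# tiny usage example with random data
rng = np.random.default_rng(0)
reward = rng.random(4).astype(np.float32)
is_over = np.array([0, 0, 1, 0])
q_next = rng.random((4, 6)).astype(np.float32)
target_q_next = rng.random((4, 6)).astype(np.float32)
print(double_dqn_target(reward, is_over, q_next, target_q_next))
```

The commented-out `best_v = tf.reduce_max(targetQ_predict_value, 1)` line in the diff is the vanilla-DQN alternative, which both selects and evaluates the action with the target network.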
@@ -17,6 +17,7 @@ __all__ = ['total_timer', 'timed_operation',
     'print_total_timer', 'IterSpeedCounter']


 class IterSpeedCounter(object):
+    """ To count how often some code gets reached"""
     def __init__(self, print_every, name=None):
         self.cnt = 0
         self.print_every = int(print_every)
...
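The second hunk only adds a docstring to `IterSpeedCounter`; apart from the constructor signature `(print_every, name=None)`, none of the class's interface is visible here. As a rough illustration of the pattern the docstring describes (counting how often some code gets reached and reporting the rate), here is a self-contained stand-in, not the repository's implementation:

```python
import time

class SimpleIterSpeedCounter(object):
    """Illustrative stand-in (not the repo's class): count how often some code
    gets reached and report the rate every `print_every` hits."""
    def __init__(self, print_every, name=None):
        self.cnt = 0
        self.print_every = int(print_every)
        self.name = name or 'IterSpeed'
        self.start = time.time()

    def __call__(self):
        self.cnt += 1
        if self.cnt % self.print_every == 0:
            elapsed = time.time() - self.start
            print("{}: reached {} times, {:.1f} it/s".format(
                self.name, self.cnt, self.cnt / max(elapsed, 1e-9)))

# usage: call the counter at the spot you want to instrument
speed = SimpleIterSpeedCounter(print_every=1000, name='train-loop')
for _ in range(5000):
    speed()
```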