Commit 941093a9 authored by Yuxin Wu

gym a3c

parent be3409fb
......@@ -15,9 +15,9 @@ from tensorpack.utils.stat import *
from tensorpack.callbacks import *
global get_player
get_player = None
def play_one_episode(player, func, verbose=False):
# 0.01-greedy evaluation
def f(s):
spc = player.get_action_space()
act = func([[s]])[0][0].argmax()
......@@ -62,6 +62,7 @@ def eval_with_funcs(predict_funcs, nr_eval):
for k in threads:
k.start()
time.sleep(0.1) # avoid simulator bugs
stat = StatCounter()
try:
for _ in tqdm(range(nr_eval), **get_tqdm_kwargs()):
......@@ -98,7 +99,7 @@ class Evaluator(Callback):
t = time.time()
mean, max = eval_with_funcs(self.pred_funcs, nr_eval=self.eval_episode)
t = time.time() - t
if t > 8 * 60: # eval takes too long
self.eval_episode = int(self.eval_episode * 0.89)
if t > 10 * 60: # eval takes too long
self.eval_episode = int(self.eval_episode * 0.94)
self.trainer.write_scalar_summary('mean_score', mean)
self.trainer.write_scalar_summary('max_score', max)
../Atari2600/common.py
\ No newline at end of file
......@@ -22,6 +22,8 @@ IMAGE_SHAPE3 = IMAGE_SIZE + (CHANNEL,)
NUM_ACTIONS = None
ENV_NAME = None
from common import play_one_episode
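# Presumably for submission: the env writes its monitor/recording files into dumpdir
# so that gym.upload() in do_submit() can send the results to the Gym scoreboard.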
def get_player(dumpdir=None):
pl = GymEnv(ENV_NAME, dumpdir=dumpdir, auto_restart=False)
pl = MapPlayerState(pl, lambda img: cv2.resize(img, IMAGE_SIZE[::-1]))
......@@ -40,7 +42,6 @@ class Model(ModelDesc):
InputVar(tf.float32, (None,), 'futurereward') ]
def _get_NN_prediction(self, image):
""" image: [0,255]"""
image = image / 255.0
with argscope(Conv2D, nl=tf.nn.relu):
l = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
......@@ -61,17 +62,6 @@ class Model(ModelDesc):
policy = self._get_NN_prediction(state)
self.logits = tf.nn.softmax(policy, name='logits')
def play_one_episode(player, func, verbose=False):
def f(s):
spc = player.get_action_space()
act = func([[s]])[0][0].argmax()
if random.random() < 0.001:
act = spc.sample()
if verbose:
print(act)
return act
return np.mean(player.play_one_episode(f))
def run_submission(cfg):
dirname = 'gym-submit'
player = get_player(dumpdir=dirname)
......@@ -80,7 +70,11 @@ def run_submission(cfg):
if k != 0:
player.restart_episode()
score = play_one_episode(player, predfunc)
print("Score:", score)
print("Total:", score)
def do_submit():
dirname = 'gym-submit'
gym.upload(dirname, api_key='xxx')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
......
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# File: train-atari.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>
import numpy as np
import tensorflow as tf
import os, sys, re, time
import cv2  # used below in get_player() to resize frames
import random
import uuid
import argparse
import multiprocessing, threading
from collections import deque
import six
from six.moves import queue
from tensorpack import *
from tensorpack.utils import *
from tensorpack.utils.concurrency import *
from tensorpack.utils.serialize import *
from tensorpack.utils.timer import *
from tensorpack.utils.stat import *
from tensorpack.tfutils import symbolic_functions as symbf
from tensorpack.RL import *
import common
from common import (play_model, Evaluator, eval_model_multithread)
IMAGE_SIZE = (84, 84)
FRAME_HISTORY = 4
GAMMA = 0.99
CHANNEL = FRAME_HISTORY * 3
IMAGE_SHAPE3 = IMAGE_SIZE + (CHANNEL,)
LOCAL_TIME_MAX = 5
STEP_PER_EPOCH = 6000
EVAL_EPISODE = 50
BATCH_SIZE = 128
SIMULATOR_PROC = 50
PREDICTOR_THREAD_PER_GPU = 2
PREDICTOR_THREAD = None
EVALUATE_PROC = min(multiprocessing.cpu_count() // 2, 20)
NUM_ACTIONS = None
ENV_NAME = None
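# get_player() builds the preprocessing pipeline: resize each raw frame to
# IMAGE_SIZE (84x84), stack the last FRAME_HISTORY frames along the channel axis,
# and, for evaluation only, add wrappers that break out of stuck states and cap
# the episode length at 40000 steps.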
def get_player(viz=False, train=False, dumpdir=None):
pl = GymEnv(ENV_NAME, dumpdir=dumpdir)
def func(img):
return cv2.resize(img, IMAGE_SIZE[::-1])
pl = MapPlayerState(pl, func)
global NUM_ACTIONS
NUM_ACTIONS = pl.get_action_space().num_actions()
pl = HistoryFramePlayer(pl, FRAME_HISTORY)
if not train:
pl = PreventStuckPlayer(pl, 30, 1)
pl = LimitLengthPlayer(pl, 40000)
return pl
common.get_player = get_player
class MySimulatorWorker(SimulatorProcess):
def _build_player(self):
return get_player(train=True)
class Model(ModelDesc):
def _get_input_vars(self):
assert NUM_ACTIONS is not None
return [InputVar(tf.float32, (None,) + IMAGE_SHAPE3, 'state'),
InputVar(tf.int32, (None,), 'action'),
InputVar(tf.float32, (None,), 'futurereward') ]
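# Shared conv trunk (4 conv layers interleaved with 3 max-poolings, then a
# 512-unit FC with PReLU) feeding two heads: 'fc-pi' outputs unnormalized policy
# logits over NUM_ACTIONS, 'fc-v' outputs a scalar state-value estimate.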
def _get_NN_prediction(self, image, is_training):
image = image / 255.0
with argscope(Conv2D, nl=tf.nn.relu):
l = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
l = MaxPooling('pool0', l, 2)
l = Conv2D('conv1', l, out_channel=32, kernel_shape=5)
l = MaxPooling('pool1', l, 2)
l = Conv2D('conv2', l, out_channel=64, kernel_shape=4)
l = MaxPooling('pool2', l, 2)
l = Conv2D('conv3', l, out_channel=64, kernel_shape=3)
l = FullyConnected('fc0', l, 512, nl=tf.identity)
l = PReLU('prelu', l)
policy = FullyConnected('fc-pi', l, out_dim=NUM_ACTIONS, nl=tf.identity)
value = FullyConnected('fc-v', l, 1, nl=tf.identity)
return policy, value
def _build_graph(self, inputs, is_training):
state, action, futurereward = inputs
policy, self.value = self._get_NN_prediction(state, is_training)
self.value = tf.squeeze(self.value, [1], name='pred_value') # (B,)
self.logits = tf.nn.softmax(policy, name='logits')
expf = tf.get_variable('explore_factor', shape=[],
initializer=tf.constant_initializer(1), trainable=False)
logitsT = tf.nn.softmax(policy * expf, name='logitsT')
if not is_training:
return
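# A3C loss terms. Sign convention: advantage = V(s) - R, so minimizing
# policy_loss = sum(log pi(a|s) * advantage) maximizes log pi(a|s) * (R - V(s)).
# xentropy_loss is the negative policy entropy; adding entropy_beta * xentropy_loss
# to the cost therefore rewards higher entropy (more exploration).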
log_probs = tf.log(self.logits + 1e-6)
log_pi_a_given_s = tf.reduce_sum(
log_probs * tf.one_hot(tf.cast(action,tf.int64), NUM_ACTIONS, 1.0, 0.0), 1)
advantage = tf.sub(tf.stop_gradient(self.value), futurereward, name='advantage')
policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage, name='policy_loss')
xentropy_loss = tf.reduce_sum(
self.logits * log_probs, name='xentropy_loss')
value_loss = tf.nn.l2_loss(self.value - futurereward, name='value_loss')
pred_reward = tf.reduce_mean(self.value, name='predict_reward')
advantage = tf.sqrt(tf.reduce_mean(tf.square(advantage)), name='rms_advantage')
summary.add_moving_summary(policy_loss, xentropy_loss, value_loss, pred_reward, advantage)
entropy_beta = tf.get_variable('entropy_beta', shape=[],
initializer=tf.constant_initializer(0.01), trainable=False)
self.cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
self.cost = tf.truediv(self.cost,
tf.cast(tf.shape(futurereward)[0], tf.float32),
name='cost')
def get_gradient_processor(self):
return [MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
SummaryGradient()]
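# MySimulatorMaster lives in the training process: it receives states from the
# simulator workers, runs them through a shared multi-threaded async predictor
# ('logitsT' is the exploration-factor-scaled policy), samples an action to send
# back, and converts finished trajectory segments into (state, action, return)
# samples pushed onto self.queue for the training dataflow.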
class MySimulatorMaster(SimulatorMaster, Callback):
def __init__(self, pipe_c2s, pipe_s2c, model):
super(MySimulatorMaster, self).__init__(pipe_c2s, pipe_s2c)
self.M = model
self.queue = queue.Queue(maxsize=BATCH_SIZE*8*2)
def _setup_graph(self):
self.sess = self.trainer.sess
self.async_predictor = MultiThreadAsyncPredictor(
self.trainer.get_predict_funcs(['state'], ['logitsT', 'pred_value'],
PREDICTOR_THREAD), batch_size=15)
self.async_predictor.run()
def _on_state(self, state, ident):
def cb(outputs):
distrib, value = outputs.result()
assert np.all(np.isfinite(distrib))
action = np.random.choice(len(distrib), p=distrib)
client = self.clients[ident]
client.memory.append(TransitionExperience(state, action, None, value=value))
self.send_queue.put([ident, dumps(action)])
self.async_predictor.put_task([state], cb)
def _on_episode_over(self, ident):
self._parse_memory(0, ident, True)
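# Every LOCAL_TIME_MAX steps, use the predicted value of the newest state as the
# bootstrap return and flush the preceding transitions as training samples.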
def _on_datapoint(self, ident):
client = self.clients[ident]
if len(client.memory) == LOCAL_TIME_MAX + 1:
R = client.memory[-1].value
self._parse_memory(R, ident, False)
def _parse_memory(self, init_r, ident, isOver):
client = self.clients[ident]
mem = client.memory
if not isOver:
last = mem[-1]
mem = mem[:-1]
mem.reverse()
R = float(init_r)
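# Walk the segment backwards, accumulating the clipped discounted return:
# R_t = clip(r_t, -1, 1) + GAMMA * R_{t+1}, starting from the bootstrap value init_r.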
for idx, k in enumerate(mem):
R = np.clip(k.reward, -1, 1) + GAMMA * R
self.queue.put([k.state, k.action, R])
if not isOver:
client.memory = [last]
else:
client.memory = []
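# get_config() wires everything together: SIMULATOR_PROC worker processes feed the
# master, whose queue is batched into the training dataflow; the master is also
# registered as a callback and as an extra thread so its async predictor is created
# inside the trainer.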
def get_config():
logger.auto_set_dir()
M = Model()
name_base = str(uuid.uuid1())[:6]
PIPE_DIR = os.environ.get('TENSORPACK_PIPEDIR', '.').rstrip('/')
namec2s = 'ipc://{}/sim-c2s-{}'.format(PIPE_DIR, name_base)
names2c = 'ipc://{}/sim-s2c-{}'.format(PIPE_DIR, name_base)
procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
ensure_proc_terminate(procs)
start_proc_mask_signal(procs)
master = MySimulatorMaster(namec2s, names2c, M)
dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
lr = tf.Variable(0.001, trainable=False, name='learning_rate')
tf.scalar_summary('learning_rate', lr)
return TrainConfig(
dataset=dataflow,
optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
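# The schedules below are (epoch, new value) pairs; the HumanHyperParamSetter
# callbacks additionally let the same hyper-parameters be adjusted manually while
# training runs.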
callbacks=Callbacks([
StatPrinter(), ModelSaver(),
ScheduledHyperParamSetter('learning_rate', [(80, 0.0003), (120, 0.0001)]),
ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
ScheduledHyperParamSetter('explore_factor',
[(80, 2), (100, 3), (120, 4), (140, 5)]),
HumanHyperParamSetter('learning_rate'),
HumanHyperParamSetter('entropy_beta'),
HumanHyperParamSetter('explore_factor'),
master,
PeriodicCallback(Evaluator(EVAL_EPISODE, ['state'], ['logits']), 2),
]),
extra_threads_procs=[master],
session_config=get_default_sess_config(0.5),
model=M,
step_per_epoch=STEP_PER_EPOCH,
max_epoch=1000,
)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.') # nargs='*' in multi mode
parser.add_argument('--load', help='load model')
parser.add_argument('--env', help='env', required=True)
parser.add_argument('--task', help='task to perform',
choices=['play', 'eval', 'train'], default='train')
args = parser.parse_args()
ENV_NAME = args.env
p = get_player(); del p # set NUM_ACTIONS
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
if args.task != 'train':
assert args.load is not None
if args.task != 'train':
cfg = PredictConfig(
model=Model(),
session_init=SaverRestore(args.load),
input_var_names=['state'],
output_var_names=['logits:0'])
if args.task == 'play':
play_model(cfg)
elif args.task == 'eval':
eval_model_multithread(cfg, EVAL_EPISODE)
else:
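# With more than one GPU, the later half of the GPUs run the prediction threads
# (predict_tower) and the remaining ones run the training towers (config.tower).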
nr_gpu = get_nr_gpu()
if nr_gpu > 1:
predict_tower = range(nr_gpu)[-nr_gpu/2:]
else:
predict_tower = [0]
PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
config = get_config()
if args.load:
config.session_init = SaverRestore(args.load)
config.tower = range(nr_gpu)[:-nr_gpu/2] or [0]
logger.info("[BA3C] Train on gpu {} and infer on gpu {}".format(
','.join(map(str, config.tower)), ','.join(map(str, predict_tower))))
AsyncMultiGPUTrainer(config, predict_tower=predict_tower).train()
......@@ -73,7 +73,7 @@ class BSDS500(RNGDataFlow):
gt = loadmat(gt_file)['groundTruth'][0]
n_annot = gt.shape[0]
gt = sum(gt[k]['Boundaries'][0][0] for k in range(n_annot))
gt[gt <= 2] = 0
#gt[gt <= 2] = 0
gt = gt.astype('float32')
gt /= np.max(gt)
if gt.shape[0] > gt.shape[1]:
......