
Commit

Merge branch 'pisiiki-master'
miyosuda committed Oct 24, 2017
2 parents dde6959 + 071daab commit 31d4886
Showing 12 changed files with 26 additions and 38 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -48,7 +48,7 @@ All weights of convolution layers and LSTM layer are shared.


## How to train
- First, dowload and install DeepMind Lab
+ First, download and install DeepMind Lab
```
$ git clone https://github.com/deepmind/lab.git
```
@@ -60,7 +60,7 @@ Clone this repo in lab directory.
$ cd lab
$ git clone https://github.com/miyosuda/unreal.git
```
- Add this bazel instrution at the end of `lab/BUILD` file
+ Add this bazel instruction at the end of `lab/BUILD` file

```
package(default_visibility = ["//visibility:public"])
5 changes: 2 additions & 3 deletions display.py
@@ -8,8 +8,7 @@
import cv2
import os
from collections import deque
- import pygame, sys
- from pygame.locals import *
+ import pygame

from environment.environment import Environment
from model.model import UnrealModel
@@ -322,7 +321,7 @@ def main(args):
if flags.recording:
writer.add_frame(d)
else:
- frame_file_path = "{0}/{1:06d}.png".format(FRAME_SAVE_DIR, frame_count)
+ frame_file_path = "{0}/{1:06d}.png".format(flags.frame_save_dir, frame_count)
cv2.imwrite(frame_file_path, d)
frame_count += 1
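
For reference, the {1:06d} format spec in the path above zero-pads the frame counter to six digits. A one-line sketch (the directory name is made up):

# sketch: zero-padded frame filenames
"{0}/{1:06d}.png".format("frames", 42)   # -> 'frames/000042.png'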

2 changes: 0 additions & 2 deletions environment/environment.py
@@ -4,8 +4,6 @@
from __future__ import print_function

import numpy as np
- import cv2


class Environment(object):
# cached action size
2 changes: 1 addition & 1 deletion environment/environment_test.py
@@ -35,7 +35,7 @@ def test_maze(self):

def check_environment(self, env_type, env_name):
environment = Environment.create_environment(env_type, env_name)
- action_size = Environment.get_action_size(env_type, env_name)
+ # action_size = Environment.get_action_size(env_type, env_name) # Not used

for i in range(3):
state, reward, terminal, pixel_change = environment.process(0)
2 changes: 0 additions & 2 deletions main.py
@@ -5,10 +5,8 @@

import tensorflow as tf
import threading
- import numpy as np

import signal
- import random
import math
import os
import time
20 changes: 10 additions & 10 deletions model/model.py
@@ -67,7 +67,7 @@ def _create_network(self, for_display):
if self._use_value_replay:
self._create_vr_network()

- # [Reawrd prediction network]
+ # [Reward prediction network]
if self._use_reward_prediction:
self._create_rp_network()

@@ -178,7 +178,7 @@ def _create_pc_network(self):

# pc lastm layers
pc_initial_lstm_state = self.lstm_cell.zero_state(1, tf.float32)
- # (Initial state is always resetted.)
+ # (Initial state is always reset.)

pc_lstm_outputs, _ = self._base_lstm_layer(pc_conv_output,
self.pc_last_action_reward_input,
@@ -238,7 +238,7 @@ def _create_vr_network(self):

# pc lastm layers
vr_initial_lstm_state = self.lstm_cell.zero_state(1, tf.float32)
- # (Initial state is always resetted.)
+ # (Initial state is always reset.)

vr_lstm_outputs, _ = self._base_lstm_layer(vr_conv_output,
self.vr_last_action_reward_input,
@@ -253,14 +253,14 @@ def _create_rp_network(self):

# RP conv layers
rp_conv_output = self._base_conv_layers(self.rp_input, reuse=True)
- rp_conv_output_rehaped = tf.reshape(rp_conv_output, [1,9*9*32*3])
+ rp_conv_output_reshaped = tf.reshape(rp_conv_output, [1,9*9*32*3])

with tf.variable_scope("rp_fc") as scope:
# Weights
W_fc1, b_fc1 = self._fc_variable([9*9*32*3, 3], "rp_fc1")

# Reawrd prediction class output. (zero, positive, negative)
- self.rp_c = tf.nn.softmax(tf.matmul(rp_conv_output_rehaped, W_fc1) + b_fc1)
+ self.rp_c = tf.nn.softmax(tf.matmul(rp_conv_output_reshaped, W_fc1) + b_fc1)
# (1,3)
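
For reference, the reshape size used above appears to come from flattening the conv features of the three stacked frames fed to the reward prediction head. A small sanity-check sketch (shapes taken from the reshape itself):

# sketch: where [1, 9*9*32*3] comes from
conv_h, conv_w, conv_c = 9, 9, 32   # conv output per input frame
num_rp_frames = 3                   # reward prediction uses 3 stacked frames
assert conv_h * conv_w * conv_c * num_rp_frames == 7776   # == 9*9*32*3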

def _base_loss(self):
@@ -380,7 +380,7 @@ def run_base_policy_value_pc_q(self, sess, s_t, last_action_reward):
def run_base_value(self, sess, s_t, last_action_reward):
# This run_bae_value() is used for calculating V for bootstrapping at the
# end of LOCAL_T_MAX time step sequence.
- # When next sequcen starts, V will be calculated again with the same state using updated network weights,
+ # When next sequence starts, V will be calculated again with the same state using updated network weights,
# so we don't update LSTM state here.
v_out, _ = sess.run( [self.base_v, self.base_lstm_state],
feed_dict = {self.base_input : [s_t],
@@ -415,8 +415,8 @@ def get_vars(self):
return self.variables


- def sync_from(self, src_netowrk, name=None):
- src_vars = src_netowrk.get_vars()
+ def sync_from(self, src_network, name=None):
+ src_vars = src_network.get_vars()
dst_vars = self.get_vars()

sync_ops = []
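
The rest of sync_from falls outside this hunk. As a rough sketch only (not the exact code in this file), the usual way to finish this pattern is to assign each destination variable from its source and group the ops:

# sketch: build one grouped assign op from the paired variable lists
for (src_var, dst_var) in zip(src_vars, dst_vars):
    sync_ops.append(tf.assign(dst_var, src_var))
return tf.group(*sync_ops, name=name)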
@@ -477,8 +477,8 @@ def _get2d_deconv_output_size(self,
out_width = (input_width - 1) * stride + filter_width

elif padding_type == 'SAME':
- out_height = input_height * row_stride
- out_width = input_width * col_stride
+ out_height = input_height * stride
+ out_width = input_width * stride

return out_height, out_width
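
The corrected 'SAME' branch follows the standard transposed-convolution output-size rule. A small numeric sketch with made-up sizes:

# sketch: 2-D deconvolution (transposed conv) output sizes
input_size, filter_size, stride = 9, 4, 2
valid_out = (input_size - 1) * stride + filter_size   # 'VALID' padding -> 20
same_out  = input_size * stride                       # 'SAME' padding  -> 18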

4 changes: 2 additions & 2 deletions options.py
@@ -9,7 +9,7 @@
def get_options(option_type):
"""
option_type: string
- 'training' or 'diplay' or 'visualize'
+ 'training' or 'display' or 'visualize'
"""
# Common
tf.app.flags.DEFINE_string("env_type", "lab", "environment type (lab or gym or maze)")
@@ -33,7 +33,7 @@ def get_options(option_type):
tf.app.flags.DEFINE_float("initial_alpha_log_rate", 0.5, "log_uniform interpolate rate for learning rate")
tf.app.flags.DEFINE_float("gamma", 0.99, "discount factor for rewards")
tf.app.flags.DEFINE_float("gamma_pc", 0.9, "discount factor for pixel control")
- tf.app.flags.DEFINE_float("entropy_beta", 0.001, "entropy regurarlization constant")
+ tf.app.flags.DEFINE_float("entropy_beta", 0.001, "entropy regularization constant")
tf.app.flags.DEFINE_float("pixel_change_lambda", 0.05, "pixel change lambda") # 0.05, 0.01 ~ 0.1 for lab, 0.0001 ~ 0.01 for gym
tf.app.flags.DEFINE_integer("experience_history_size", 2000, "experience replay buffer size")
tf.app.flags.DEFINE_integer("max_time_step", 10 * 10**7, "max time steps")
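
For context, these definitions are read back through tf.app.flags. A usage sketch, assuming get_options returns the tf.app.flags.FLAGS object:

# sketch: consuming the options defined above
flags = get_options("training")
print(flags.gamma, flags.gamma_pc, flags.entropy_beta)   # 0.99 0.9 0.001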
2 changes: 0 additions & 2 deletions test.py
@@ -3,8 +3,6 @@
from __future__ import division
from __future__ import print_function

- import sys

import unittest
import train.experience_test
import train.rmsprop_applier_test
3 changes: 1 addition & 2 deletions train/experience.py
@@ -3,7 +3,6 @@
from __future__ import division
from __future__ import print_function

- import random
import numpy as np
from collections import deque

@@ -12,7 +11,7 @@ class ExperienceFrame(object):
def __init__(self, state, reward, action, terminal, pixel_change, last_action, last_reward):
self.state = state
self.action = action # (Taken action with the 'state')
- self.reward = np.clip(reward, -1, 1) # Reveived reward with the 'state'. (Clipped)
+ self.reward = np.clip(reward, -1, 1) # Reward with the 'state'. (Clipped)
self.terminal = terminal # (Whether terminated when 'state' was inputted)
self.pixel_change = pixel_change
self.last_action = last_action # (After this last action was taken, agent move to the 'state')
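
A tiny usage sketch of the constructor above, with made-up values, showing the clipping:

# sketch: rewards outside [-1, 1] are clipped when the frame is stored
frame = ExperienceFrame(state=None, reward=2.5, action=3, terminal=False,
                        pixel_change=None, last_action=1, last_reward=0.0)
assert frame.reward == 1.0   # np.clip(2.5, -1, 1)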
7 changes: 3 additions & 4 deletions train/experience_test.py
@@ -4,15 +4,14 @@
from __future__ import print_function

import unittest
- import numpy as np

from train.experience import Experience, ExperienceFrame


class TestExperience(unittest.TestCase):
- def _add_frame(self, experice, reward):
+ def _add_frame(self, experience, reward):
frame = ExperienceFrame(0, reward, 0, False, 0, 0, 0)
- experice.add_frame(frame)
+ experience.add_frame(frame)

def test_process(self):
experience = Experience(10)
@@ -33,7 +32,7 @@ def test_process(self):
for i in range(100):
frames = experience.sample_rp_sequence()
self.assertTrue( len(frames) == 4 )
- # Reward shold be shewed here.
+ # Reward should be shewed here.
#print(frames[3].reward)

if __name__ == '__main__':
4 changes: 2 additions & 2 deletions train/rmsprop_applier.py
@@ -37,7 +37,7 @@ def __init__(self,

def _create_slots(self, var_list):
for v in var_list:
- # 'val' is Variable's intial value tensor.
+ # 'val' is Variable's initial value tensor.
val = tf.constant(1.0, dtype=v.dtype, shape=v.get_shape())
self._get_or_make_slot(v, val, "rms", self._name)
self._zeros_slot(v, "momentum", self._name)
@@ -112,7 +112,7 @@ def _apply_gradients(self, global_var_list, local_grad_list, name=None):
with tf.control_dependencies(None):
self._create_slots(global_var_list)

- # global gradinet norm clipping
+ # global gradient norm clipping
local_grad_list, _ = tf.clip_by_global_norm(local_grad_list, self._clip_norm)
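
For reference, tf.clip_by_global_norm rescales the whole gradient list when its combined norm exceeds the clip value. A plain-numpy sketch of the same computation (made-up gradients):

# sketch: what global norm clipping does
import numpy as np
grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]     # norms 5 and 12
global_norm = np.sqrt(sum(np.sum(g * g) for g in grads))  # sqrt(25 + 144) = 13
clip_norm = 6.5
scale = clip_norm / max(global_norm, clip_norm)           # 0.5 here, 1.0 if under the limit
clipped = [g * scale for g in grads]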

with tf.name_scope(name, self._name,[]) as name:
9 changes: 3 additions & 6 deletions train/trainer.py
@@ -3,11 +3,8 @@
from __future__ import division
from __future__ import print_function

- import tensorflow as tf
import numpy as np
- import random
import time
- import sys

from environment.environment import Environment
from model.model import UnrealModel
@@ -244,7 +241,7 @@ def _process_pc(self, sess):
# [pixel change]
# Sample 20+1 frame (+1 for last next state)
pc_experience_frames = self.experience.sample_sequence(self.local_t_max+1)
- # Revese sequence to calculate from the last
+ # Reverse sequence to calculate from the last
pc_experience_frames.reverse()

batch_pc_si = []
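
The reverse() above lets the discounted pixel-change return be accumulated from the last frame backwards. A generic sketch of that pattern (not the exact code that follows in this file; gamma_pc stands for the pixel-control discount flag):

# sketch: backward accumulation of the discounted pixel-change return
R_pc = 0.0   # in practice bootstrapped from the extra (+1) frame
for frame in pc_experience_frames[1:]:   # frames were just reversed
    R_pc = frame.pixel_change + gamma_pc * R_pc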
@@ -282,7 +279,7 @@ def _process_vr(self, sess):
# [Value replay]
# Sample 20+1 frame (+1 for last next state)
vr_experience_frames = self.experience.sample_sequence(self.local_t_max+1)
- # Revese sequence to calculate from the last
+ # Reverse sequence to calculate from the last
vr_experience_frames.reverse()

batch_vr_si = []
@@ -397,7 +394,7 @@ def process(self, sess, global_t, summary_writer, summary_op, score_input):
}
feed_dict.update(rp_feed_dict)

- # Calculate gradients and copy them to global netowrk.
+ # Calculate gradients and copy them to global network.
sess.run( self.apply_gradients, feed_dict=feed_dict )

self._print_log(global_t)
