
Commit

Merge branch 'pisiiki-master'
miyosuda committed Oct 24, 2017
2 parents dde6959 + 071daab commit 31d4886
Showing 12 changed files with 26 additions and 38 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -48,7 +48,7 @@ All weights of convolution layers and LSTM layer are shared.


## How to train
- First, dowload and install DeepMind Lab
+ First, download and install DeepMind Lab
```
$ git clone https://github.com/deepmind/lab.git
```
@@ -60,7 +60,7 @@ Clone this repo in lab directory.
$ cd lab
$ git clone https://github.com/miyosuda/unreal.git
```
- Add this bazel instrution at the end of `lab/BUILD` file
+ Add this bazel instruction at the end of `lab/BUILD` file

```
package(default_visibility = ["//visibility:public"])
5 changes: 2 additions & 3 deletions display.py
@@ -8,8 +8,7 @@
import cv2
import os
from collections import deque
- import pygame, sys
- from pygame.locals import *
+ import pygame

from environment.environment import Environment
from model.model import UnrealModel
@@ -322,7 +321,7 @@ def main(args):
if flags.recording:
writer.add_frame(d)
else:
- frame_file_path = "{0}/{1:06d}.png".format(FRAME_SAVE_DIR, frame_count)
+ frame_file_path = "{0}/{1:06d}.png".format(flags.frame_save_dir, frame_count)
cv2.imwrite(frame_file_path, d)
frame_count += 1
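
For reference, the {1:06d} format spec in the path above zero-pads the frame counter to six digits. A one-line sketch (the directory name is made up):

# sketch: zero-padded frame filenames
"{0}/{1:06d}.png".format("frames", 42)   # -> 'frames/000042.png'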

2 changes: 0 additions & 2 deletions environment/environment.py
@@ -4,8 +4,6 @@
from __future__ import print_function

import numpy as np
- import cv2


class Environment(object):
# cached action size
2 changes: 1 addition & 1 deletion environment/environment_test.py
@@ -35,7 +35,7 @@ def test_maze(self):

def check_environment(self, env_type, env_name):
environment = Environment.create_environment(env_type, env_name)
- action_size = Environment.get_action_size(env_type, env_name)
+ # action_size = Environment.get_action_size(env_type, env_name) # Not used

for i in range(3):
state, reward, terminal, pixel_change = environment.process(0)
2 changes: 0 additions & 2 deletions main.py
@@ -5,10 +5,8 @@

import tensorflow as tf
import threading
- import numpy as np

import signal
- import random
import math
import os
import time
20 changes: 10 additions & 10 deletions model/model.py
@@ -67,7 +67,7 @@ def _create_network(self, for_display):
if self._use_value_replay:
self._create_vr_network()

- # [Reawrd prediction network]
+ # [Reward prediction network]
if self._use_reward_prediction:
self._create_rp_network()

@@ -178,7 +178,7 @@ def _create_pc_network(self):

# pc lastm layers
pc_initial_lstm_state = self.lstm_cell.zero_state(1, tf.float32)
- # (Initial state is always resetted.)
+ # (Initial state is always reset.)

pc_lstm_outputs, _ = self._base_lstm_layer(pc_conv_output,
self.pc_last_action_reward_input,
@@ -238,7 +238,7 @@ def _create_vr_network(self):

# pc lastm layers
vr_initial_lstm_state = self.lstm_cell.zero_state(1, tf.float32)
- # (Initial state is always resetted.)
+ # (Initial state is always reset.)

vr_lstm_outputs, _ = self._base_lstm_layer(vr_conv_output,
self.vr_last_action_reward_input,
@@ -253,14 +253,14 @@ def _create_rp_network(self):

# RP conv layers
rp_conv_output = self._base_conv_layers(self.rp_input, reuse=True)
- rp_conv_output_rehaped = tf.reshape(rp_conv_output, [1,9*9*32*3])
+ rp_conv_output_reshaped = tf.reshape(rp_conv_output, [1,9*9*32*3])

with tf.variable_scope("rp_fc") as scope:
# Weights
W_fc1, b_fc1 = self._fc_variable([9*9*32*3, 3], "rp_fc1")

# Reawrd prediction class output. (zero, positive, negative)
- self.rp_c = tf.nn.softmax(tf.matmul(rp_conv_output_rehaped, W_fc1) + b_fc1)
+ self.rp_c = tf.nn.softmax(tf.matmul(rp_conv_output_reshaped, W_fc1) + b_fc1)
# (1,3)
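
For reference, the reshape size used above appears to come from flattening the conv features of the three stacked frames fed to the reward prediction head. A small sanity-check sketch (shapes taken from the reshape itself):

# sketch: where [1, 9*9*32*3] comes from
conv_h, conv_w, conv_c = 9, 9, 32   # conv output per input frame
num_rp_frames = 3                   # reward prediction uses 3 stacked frames
assert conv_h * conv_w * conv_c * num_rp_frames == 7776   # == 9*9*32*3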

def _base_loss(self):
@@ -380,7 +380,7 @@ def run_base_policy_value_pc_q(self, sess, s_t, last_action_reward):
def run_base_value(self, sess, s_t, last_action_reward):
# This run_bae_value() is used for calculating V for bootstrapping at the
# end of LOCAL_T_MAX time step sequence.
- # When next sequcen starts, V will be calculated again with the same state using updated network weights,
+ # When next sequence starts, V will be calculated again with the same state using updated network weights,
# so we don't update LSTM state here.
v_out, _ = sess.run( [self.base_v, self.base_lstm_state],
feed_dict = {self.base_input : [s_t],
@@ -415,8 +415,8 @@ def get_vars(self):
return self.variables


- def sync_from(self, src_netowrk, name=None):
- src_vars = src_netowrk.get_vars()
+ def sync_from(self, src_network, name=None):
+ src_vars = src_network.get_vars()
dst_vars = self.get_vars()

sync_ops = []
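
The rest of sync_from falls outside this hunk. As a rough sketch only (not the exact code in this file), the usual way to finish this pattern is to assign each destination variable from its source and group the ops:

# sketch: build one grouped assign op from the paired variable lists
for (src_var, dst_var) in zip(src_vars, dst_vars):
    sync_ops.append(tf.assign(dst_var, src_var))
return tf.group(*sync_ops, name=name)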
@@ -477,8 +477,8 @@ def _get2d_deconv_output_size(self,
out_width = (input_width - 1) * stride + filter_width

elif padding_type == 'SAME':
- out_height = input_height * row_stride
- out_width = input_width * col_stride
+ out_height = input_height * stride
+ out_width = input_width * stride

return out_height, out_width
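
The corrected 'SAME' branch follows the standard transposed-convolution output-size rule. A small numeric sketch with made-up sizes:

# sketch: 2-D deconvolution (transposed conv) output sizes
input_size, filter_size, stride = 9, 4, 2
valid_out = (input_size - 1) * stride + filter_size   # 'VALID' padding -> 20
same_out  = input_size * stride                       # 'SAME' padding  -> 18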

4 changes: 2 additions & 2 deletions options.py
@@ -9,7 +9,7 @@
def get_options(option_type):
"""
option_type: string
- 'training' or 'diplay' or 'visualize'
+ 'training' or 'display' or 'visualize'
"""
# Common
tf.app.flags.DEFINE_string("env_type", "lab", "environment type (lab or gym or maze)")
@@ -33,7 +33,7 @@ def get_options(option_type):
tf.app.flags.DEFINE_float("initial_alpha_log_rate", 0.5, "log_uniform interpolate rate for learning rate")
tf.app.flags.DEFINE_float("gamma", 0.99, "discount factor for rewards")
tf.app.flags.DEFINE_float("gamma_pc", 0.9, "discount factor for pixel control")
- tf.app.flags.DEFINE_float("entropy_beta", 0.001, "entropy regurarlization constant")
+ tf.app.flags.DEFINE_float("entropy_beta", 0.001, "entropy regularization constant")
tf.app.flags.DEFINE_float("pixel_change_lambda", 0.05, "pixel change lambda") # 0.05, 0.01 ~ 0.1 for lab, 0.0001 ~ 0.01 for gym
tf.app.flags.DEFINE_integer("experience_history_size", 2000, "experience replay buffer size")
tf.app.flags.DEFINE_integer("max_time_step", 10 * 10**7, "max time steps")
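
For context, these definitions are read back through tf.app.flags. A usage sketch, assuming get_options returns the tf.app.flags.FLAGS object:

# sketch: consuming the options defined above
flags = get_options("training")
print(flags.gamma, flags.gamma_pc, flags.entropy_beta)   # 0.99 0.9 0.001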
2 changes: 0 additions & 2 deletions test.py
@@ -3,8 +3,6 @@
from __future__ import division
from __future__ import print_function

- import sys

import unittest
import train.experience_test
import train.rmsprop_applier_test
3 changes: 1 addition & 2 deletions train/experience.py
@@ -3,7 +3,6 @@
from __future__ import division
from __future__ import print_function

- import random
import numpy as np
from collections import deque

@@ -12,7 +11,7 @@ class ExperienceFrame(object):
def __init__(self, state, reward, action, terminal, pixel_change, last_action, last_reward):
self.state = state
self.action = action # (Taken action with the 'state')
- self.reward = np.clip(reward, -1, 1) # Reveived reward with the 'state'. (Clipped)
+ self.reward = np.clip(reward, -1, 1) # Reward with the 'state'. (Clipped)
self.terminal = terminal # (Whether terminated when 'state' was inputted)
self.pixel_change = pixel_change
self.last_action = last_action # (After this last action was taken, agent move to the 'state')
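
A tiny usage sketch of the constructor above, with made-up values, showing the clipping:

# sketch: rewards outside [-1, 1] are clipped when the frame is stored
frame = ExperienceFrame(state=None, reward=2.5, action=3, terminal=False,
                        pixel_change=None, last_action=1, last_reward=0.0)
assert frame.reward == 1.0   # np.clip(2.5, -1, 1)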
7 changes: 3 additions & 4 deletions train/experience_test.py
@@ -4,15 +4,14 @@
from __future__ import print_function

import unittest
- import numpy as np

from train.experience import Experience, ExperienceFrame


class TestExperience(unittest.TestCase):
- def _add_frame(self, experice, reward):
+ def _add_frame(self, experience, reward):
frame = ExperienceFrame(0, reward, 0, False, 0, 0, 0)
- experice.add_frame(frame)
+ experience.add_frame(frame)

def test_process(self):
experience = Experience(10)
@@ -33,7 +32,7 @@ def test_process(self):
for i in range(100):
frames = experience.sample_rp_sequence()
self.assertTrue( len(frames) == 4 )
- # Reward shold be shewed here.
+ # Reward should be shewed here.
#print(frames[3].reward)

if __name__ == '__main__':
4 changes: 2 additions & 2 deletions train/rmsprop_applier.py
@@ -37,7 +37,7 @@ def __init__(self,

def _create_slots(self, var_list):
for v in var_list:
- # 'val' is Variable's intial value tensor.
+ # 'val' is Variable's initial value tensor.
val = tf.constant(1.0, dtype=v.dtype, shape=v.get_shape())
self._get_or_make_slot(v, val, "rms", self._name)
self._zeros_slot(v, "momentum", self._name)
@@ -112,7 +112,7 @@ def _apply_gradients(self, global_var_list, local_grad_list, name=None):
with tf.control_dependencies(None):
self._create_slots(global_var_list)

- # global gradinet norm clipping
+ # global gradient norm clipping
local_grad_list, _ = tf.clip_by_global_norm(local_grad_list, self._clip_norm)
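
For reference, tf.clip_by_global_norm rescales the whole gradient list when its combined norm exceeds the clip value. A plain-numpy sketch of the same computation (made-up gradients):

# sketch: what global norm clipping does
import numpy as np
grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]     # norms 5 and 12
global_norm = np.sqrt(sum(np.sum(g * g) for g in grads))  # sqrt(25 + 144) = 13
clip_norm = 6.5
scale = clip_norm / max(global_norm, clip_norm)           # 0.5 here, 1.0 if under the limit
clipped = [g * scale for g in grads]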

with tf.name_scope(name, self._name,[]) as name:
9 changes: 3 additions & 6 deletions train/trainer.py
@@ -3,11 +3,8 @@
from __future__ import division
from __future__ import print_function

- import tensorflow as tf
import numpy as np
- import random
import time
- import sys

from environment.environment import Environment
from model.model import UnrealModel
@@ -244,7 +241,7 @@ def _process_pc(self, sess):
# [pixel change]
# Sample 20+1 frame (+1 for last next state)
pc_experience_frames = self.experience.sample_sequence(self.local_t_max+1)
- # Revese sequence to calculate from the last
+ # Reverse sequence to calculate from the last
pc_experience_frames.reverse()

batch_pc_si = []
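
The reverse() above lets the discounted pixel-change return be accumulated from the last frame backwards. A generic sketch of that pattern (not the exact code that follows in this file; gamma_pc stands for the pixel-control discount flag):

# sketch: backward accumulation of the discounted pixel-change return
R_pc = 0.0   # in practice bootstrapped from the extra (+1) frame
for frame in pc_experience_frames[1:]:   # frames were just reversed
    R_pc = frame.pixel_change + gamma_pc * R_pc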
@@ -282,7 +279,7 @@ def _process_vr(self, sess):
# [Value replay]
# Sample 20+1 frame (+1 for last next state)
vr_experience_frames = self.experience.sample_sequence(self.local_t_max+1)
- # Revese sequence to calculate from the last
+ # Reverse sequence to calculate from the last
vr_experience_frames.reverse()

batch_vr_si = []
@@ -397,7 +394,7 @@ def process(self, sess, global_t, summary_writer, summary_op, score_input):
}
feed_dict.update(rp_feed_dict)

- # Calculate gradients and copy them to global netowrk.
+ # Calculate gradients and copy them to global network.
sess.run( self.apply_gradients, feed_dict=feed_dict )

self._print_log(global_t)
