diff --git a/MartinAymeline/FlappyAgent.py b/MartinAymeline/FlappyAgent.py
new file mode 100644
index 0000000..9586ab1
--- /dev/null
+++ b/MartinAymeline/FlappyAgent.py
@@ -0,0 +1,36 @@
+import numpy as np
+from keras.models import load_model
+from collections import deque
+from utilities import process_screen, greedy_action
+
+
+stacked = []
+calls = 0
+DQN = load_model('model_dqn_new_65000.h5')
+possible_actions = [119, None]
+
+
+def FlappyPolicy(state, screen):
+    global stacked
+    global calls
+    global DQN
+    global action
+
+    calls = calls + 1
+    processed_screen = process_screen(screen)
+
+    if calls == 1:
+        # On the first call, stack 4 copies of the current frame
+        stacked = deque([processed_screen, processed_screen, \
+                         processed_screen, processed_screen], maxlen=4)
+        x = np.stack(stacked, axis=-1)
+
+    else:
+        stacked.append(processed_screen)
+        x = np.stack(stacked, axis=-1)
+
+    Q = DQN.predict(np.array([x]))
+
+    return possible_actions[np.argmax(Q)]
+
+
diff --git a/MartinAymeline/constantes.py b/MartinAymeline/constantes.py
new file mode 100644
index 0000000..2d49d7b
--- /dev/null
+++ b/MartinAymeline/constantes.py
@@ -0,0 +1,25 @@
+class constantes: # Sets the main constants of the project
+
+    # Memory buffer constants
+    replay_memory_size = 200000 # number of previous transitions to remember
+    mini_batch_size = 32
+
+    # Learning constants
+    gamma = 0.99
+    total_steps = 200000 # the best network was obtained after 65000 steps
+    observation = 5000. # steps of pure exploration before learning starts
+    explore = 1000000. # frames over which to anneal epsilon
+    final_eps = 0.001 # final value of epsilon
+    initial_eps = 0.1 # starting value of epsilon
+
+    # Optimizer constants
+    alpha = 1e-4 # learning rate
+    beta_1 = 0.9
+    beta_2 = 0.999
+
+    # Evaluation constants
+    evaluation_period = 5000 # evaluation of the deep Q-network every 5000 steps
+    nb_epochs = total_steps // evaluation_period
+    epoch = -1
+
+
diff --git a/MartinAymeline/eval.log b/MartinAymeline/eval.log
new file mode 100644
index 0000000..d0dc58b
--- /dev/null
+++ b/MartinAymeline/eval.log
@@ -0,0 +1,152 @@
+0,0.0,4.69466784611
+1,0.0,4.96170904573
+2,5.47673034668,3.53810591755
+3,0.0,3.62164132165
+0,0.0,5.86053436558
+1,4.66656112671,3.88620864601
+2,7.15516853333,2.93598770647
+3,8.48180675507,3.93345115937
+4,10.2029304504,3.56317498664
+5,11.2960853577,4.7983870147
+6,12.9479646683,5.75704039982
+7,14.86444664,5.06069977694
+8,16.3370666504,4.46964998889
+9,17.982843399,6.19302055301
+10,18.5910949707,6.50989475939
+11,17.6032295227,8.10438881031
+12,15.5112314224,9.03463693305
+13,15.8485612869,11.2945019369
+14,15.7190685272,8.59904841042
+15,14.6428871155,9.70675154722
+16,13.4239826202,10.5581026454
+17,12.485534668,10.801754052
+18,11.0798435211,14.4698277258
+19,9.79876613617,10.1922431753
+20,7.87689256668,15.5221458151
+21,7.39563035965,15.4769874784
+22,5.57780265808,14.7065343904
+23,4.64339876175,16.9289811078
+24,4.26625919342,14.2144713002
+25,3.76601338387,16.3316570776
+26,3.70627355576,14.1546108109
+27,5.11464166641,15.4347679658
+28,4.40491008759,15.073854647
+29,4.24400806427,15.949591948
+30,6.30815887451,16.3497812243
+31,8.96263790131,15.1154130214
+32,11.736158371,16.4718276847
+33,12.8968105316,18.8227412404
+34,14.7348413467,17.9756738702
+35,15.7232618332,18.0161046671
+36,17.8804893494,17.6891486946
+37,20.2932090759,21.1049183936
+38,22.4963302612,20.7092030539
+0,24.967710495,19.6716769573
+1,38.5622138977,14.2626393644
+2,40.7054405212,15.117170123
+3,39.2491836548,16.5829738643
+4,45.0468063354,11.9271002253
+5,45.4236869812,10.9867965833
+6,42.0939025879,10.0917820893
+7,42.1448974609,9.4055183894
+8,39.3510360718,6.49246510544
+9,42.3908004761,4.77522978771
+10,40.0641403198,4.58435612121
+11,34.6684570313,4.10691721313
+12,32.4749946594,5.02565120362
+13,32.4948348999,5.9498961219
+14,38.884727478,4.75203246545
+15,43.4196586609,4.60334158404
+16,55.7478370667,3.81404406007
+17,53.9235687256,4.36071293652
+18,69.6009521484,4.48999655189
+19,72.2026443481,3.42965011211
+20,389.599456787,-11.9877204284
+21,2478.59570313,-11.9335284749
+22,6405.49511719,-11.9877204284
+23,13140.9375,-11.9877204284
+24,25132.1035156,-11.9877204284
+25,42703.2929688,-12.1962853141
+26,61368.8945313,-12.1962853141
+27,82755.6875,-12.1962853141
+28,104760.78125,-11.2593159839
+29,107774.976563,-11.7461346088
+30,90664.625,-11.9335284749
+31,80297.1796875,-11.2432807355
+32,51785.5625,-5.13517710508
+33,48012.2539063,-12.043981514
+34,41425.4921875,-9.27832739837
+35,32831.4921875,-2.11385608523
+36,17783.8730469,0.243092634338
+37,15080.1923828,1.10150837747
+38,16184.1445313,2.04957438321
+0,0.0,3.86854620148
+1,8.84752559662,4.25661835006
+0,0.0,-12.4,-7.0
+1,5.01180744171,-14.7,-9.0
+2,5.85478305817,-14.0,-9.0
+3,5.52176523209,-14.05,-9.0
+4,5.50321054459,-13.35,-8.0
+5,5.94852733612,-13.95,-9.0
+6,6.59453201294,-13.9,-8.0
+7,7.23370599747,-11.95,-8.0
+8,8.26382637024,-11.55,-2.0
+9,8.98322105408,-11.75,-7.0
+10,9.68953227997,-11.2,-7.0
+11,10.5278759003,-9.3,-2.0
+12,12.3057289124,-8.7,-2.0
+13,13.1675548553,-9.0,-2.0
+14,13.0113019943,-9.1,-2.0
+15,14.6724071503,-4.05,4.0
+16,16.3890533447,-6.2,-2.0
+17,17.4456806183,-5.05,4.0
+18,18.2743644714,-2.55,4.0
+19,16.9419136047,-1.35,4.0
+20,16.3637008667,-4.8,4.0
+21,18.7339382172,-4.75,4.0
+22,18.3891048431,-3.95,4.0
+23,17.445936203,-1.5,4.0
+24,14.6130094528,-0.55,4.0
+25,12.6413230896,-12.8,-8.0
+26,21.5028362274,-16.45,-14.0
+27,39.2529563904,-20.0,-20.0
+0,24.967710495,1.9,4.0
+1,38.5454292297,-0.75,4.0
+2,36.9952774048,2.55,4.0
+3,28.1553764343,-7.55,-2.0
+4,29.2874355316,-10.85,-3.0
+0,0.0,-4.9,-4.0
+1,7.87420940399,-4.9,-4.0
+2,1.58440470695,-5.0,-5.0
+3,3.56381487846,-4.8,-4.0
+4,2.0504193306,-4.75,-4.0
+5,-0.517796576023,-4.8,-4.0
+6,1.06802773476,-4.45,-4.0
+7,0.590887069702,-3.75,-1.0
+8,2.1341612339,-2.35,1.0
+9,3.60842895508,-0.1,20.0
+10,4.7539973259,9.65,52.0
+11,3.8366549015,15.8,57.0
+12,10.3156099319,19.45,66.0
+13,460.176086426,-4.4,-3.0
+14,-864.924438477,-5.0,-5.0
+15,62.3476867676,-4.75,-4.0
+16,311.228363037,-4.85,-4.0
+17,-57.1425170898,-4.65,-3.0
+18,-46.8574295044,-5.0,-5.0
+19,-28.9334468842,-5.0,-5.0
+20,10.1054878235,-4.7,-4.0
+21,-0.0738104507327,-4.05,0.0
+22,6.38661527634,-4.85,-4.0
+23,-9.64006328583,-4.25,-2.0
+24,44.112663269,-4.7,-4.0
+25,13.0731668472,-4.4,-3.0
+26,16.3547077179,-4.85,-4.0
+27,-53.566696167,-4.45,-4.0
+28,3.15030813217,-4.65,-4.0
+29,5.84460353851,-4.35,-2.0
+30,-4.33490753174,-4.55,-3.0
+31,0.0737409219146,-4.25,-2.0
+32,-4.76753520966,-4.35,-3.0
+33,-3.23868513107,-4.45,-3.0
+0,0.0,-4.85,-4.0
diff --git a/MartinAymeline/model_dqn_new_65000.h5 b/MartinAymeline/model_dqn_new_65000.h5
new file mode 100644
index 0000000..ff6ec65
Binary files /dev/null and b/MartinAymeline/model_dqn_new_65000.h5 differ
diff --git a/MartinAymeline/replay_memory.py b/MartinAymeline/replay_memory.py
new file mode 100644
index 0000000..756fe71
--- /dev/null
+++ b/MartinAymeline/replay_memory.py
@@ -0,0 +1,63 @@
+from collections import deque
+import numpy as np
+
+# A class for the replay memory.
+# We use the implementation given in the RL4 notebook.
+
+class MemoryBuffer:
+    "An experience replay buffer using numpy arrays"
+    # Initialize the class
+    def __init__(self, length, screen_shape, action_shape):
+        self.length = length
+        self.screen_shape = screen_shape
+        self.action_shape = action_shape
+        shape = (length,) + screen_shape
+        self.screens_x = np.zeros(shape, dtype=np.uint8) # starting states
+        self.screens_y = np.zeros(shape, dtype=np.uint8) # resulting states
+        shape = (length,) + action_shape
+        self.actions = np.zeros(shape, dtype=np.uint8) # actions
+        self.rewards = np.zeros((length,1), dtype=np.int8) # rewards
+        self.terminals = np.zeros((length,1), dtype=bool) # true if resulting state is terminal
+        self.terminals[-1] = True
+        self.index = 0 # points one position past the last inserted element
+        self.size = 0 # current size of the buffer
+
+    # Add state x, action a, reward r and new state y
+    def append(self, screenx, a, r, screeny, d):
+        self.screens_x[self.index] = screenx
+        self.actions[self.index] = a
+        self.rewards[self.index] = r
+        self.screens_y[self.index] = screeny
+        self.terminals[self.index] = d
+        self.index = (self.index+1) % self.length
+        self.size = np.min([self.size+1, self.length])
+
+    def stacked_frames_x(self, index):
+        im_deque = deque(maxlen=4)
+        pos = index % self.length
+        for i in range(4):
+            im = self.screens_x[pos]
+            im_deque.appendleft(im)
+            test_pos = (pos-1) % self.length
+            if self.terminals[test_pos] == False:
+                pos = test_pos
+        return np.stack(im_deque, axis=-1)
+
+    def stacked_frames_y(self, index):
+        im_deque = deque(maxlen=4)
+        pos = index % self.length
+        for i in range(4):
+            im = self.screens_y[pos]
+            im_deque.appendleft(im)
+            test_pos = (pos-1) % self.length
+            if self.terminals[test_pos] == False:
+                pos = test_pos
+        return np.stack(im_deque, axis=-1)
+
+    def minibatch(self, size):
+        indices = np.random.choice(self.size, size=size, replace=False)
+        x = np.zeros((size,)+self.screen_shape+(4,))
+        y = np.zeros((size,)+self.screen_shape+(4,))
+        for i in range(size):
+            x[i] = self.stacked_frames_x(indices[i])
+            y[i] = self.stacked_frames_y(indices[i])
+        return x, self.actions[indices], self.rewards[indices], y, self.terminals[indices]
\ No newline at end of file
diff --git a/MartinAymeline/run.py b/MartinAymeline/run.py
new file mode 100644
index 0000000..39b5801
--- /dev/null
+++ b/MartinAymeline/run.py
@@ -0,0 +1,29 @@
+# You're not allowed to change this file
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+from FlappyAgent import FlappyPolicy
+
+game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
+# Note: if you want to see you agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.
+
+p.init()
+reward = 0.0
+
+nb_games = 100
+cumulated = np.zeros((nb_games))
+
+for i in range(nb_games):
+    p.reset_game()
+
+    while(not p.game_over()):
+        state = game.getGameState()
+        screen = p.getScreenRGB()
+        action=FlappyPolicy(state, screen) ### Your job is to define this function.
+
+        reward = p.act(action)
+        cumulated[i] = cumulated[i] + reward
+
+average_score = np.mean(cumulated)
+max_score = np.max(cumulated)
diff --git a/MartinAymeline/train.py b/MartinAymeline/train.py
new file mode 100644
index 0000000..be5f25b
--- /dev/null
+++ b/MartinAymeline/train.py
@@ -0,0 +1,167 @@
+### Imports
+
+# Import the PLE games library
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+# Import common Python tools
+import numpy as np
+import time
+from collections import deque
+# Import neural network tools
+from keras.models import load_model
+# Import the helper functions built for training
+import utilities
+from replay_memory import MemoryBuffer
+# Import the constants
+from constantes import constantes as cst
+
+### Main training function
+
+def programme(training):
+
+    ## STARTING PART
+
+    if training == "init":
+        # Create the network
+        dqn = utilities.create_network()
+        print("New network created")
+        name = 'model_dqn_new.h5'
+    else:
+        # Load an existing one
+        dqn = load_model('model_dqn_to_train.h5')
+        print("Existing model loaded")
+        name = 'model_dqn_to_train.h5'
+
+    input("Continue ?")
+
+    # Start the Flappy Bird game and the environment
+    game = FlappyBird(graphics="fixed")
+    env = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, \
+              display_screen=True)
+    possible_actions = env.getActionSet() # returns [119, None]
+    # Initialize the environment and the key indicators
+    env.init()
+    reward = 0.0
+    loss = 0.0
+
+    ## INITIALIZATION PART
+
+    # Start a new game
+    env.reset_game()
+    # Note : env.act(possible_actions[0]) <-> action "119"  <-> GO UP
+    #        env.act(possible_actions[1]) <-> action "None" <-> DO NOTHING
+
+    # Initialize the "state", which is here the screen of the game
+    screen_x = utilities.process_screen(env.getScreenRGB())
+    # We stack the last 4 screen images to take speed into account in the training
+    stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
+    x = np.stack(stacked_x, axis=-1)
+    # Initialize the memory buffer which will be used for experience replay
+    replay_memory = MemoryBuffer(cst.replay_memory_size, screen_x.shape, (1,))
+
+    # Store the initial state for further evaluations
+    Xtest = np.array([x])
+    # Initialize evaluation indicators
+    scoreQ = np.zeros((cst.nb_epochs))
+    scoreMC = np.zeros((cst.nb_epochs))
+    scoreMax = np.zeros((cst.nb_epochs))
+    # Initialize timer
+    start = time.time()
+
+    ## TRAINING PART
+
+    # Here is a deep Q-learning method with experience replay
+    for step in range(cst.total_steps):
+
+        # EVALUATION :
+
+        # We evaluate the network's performance every 5000 steps
+        if step % cst.evaluation_period == 0 and step > 0:
+            cst.epoch += 1
+            print('[Epoch {:d}/{:d}] {:d} steps done'.format(cst.epoch+1, \
+                  cst.total_steps//cst.evaluation_period, cst.evaluation_period))
+            # Evaluation on the initial state
+            scoreQ[cst.epoch] = np.mean(dqn.predict(Xtest).max(1))
+            # Roll-out evaluation : at each evaluation step we store the mean
+            # and max scores over 20 games
+            scoreMC[cst.epoch], scoreMax[cst.epoch] = utilities.MCeval(env, 20, \
+                  dqn, cst.gamma)
+            # We save the evaluated network
+            dqn.save(name)
+            # And the evaluation scores
+            with open('eval.log','a') as f:
+                f.write(str(cst.epoch)+','+str(scoreQ[cst.epoch])+','+ \
+                        str(scoreMC[cst.epoch])+','+str(scoreMax[cst.epoch])+'\n')
+
+        # PLAY :
+
+        # Action selection : a random float is drawn in [0,1]. The action is
+        # chosen randomly if the float is lower than our annealing epsilon,
+        # otherwise the action is chosen using the current network.
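+        # (Worked example, for illustration only : between the observation phase
+        # and step 1e6, utilities.epsilon(step) = 0.1 - 9.9e-8 * step, so at
+        # step = 50000 about 9.5% of the actions are still drawn at random.)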
+        if np.random.rand() < utilities.epsilon(step):
+            print("Random action")
+            # When a random action is selected, the following formula decides the
+            # action. We fix it so that there is a 12.5% chance that the chosen
+            # action will be to go up (a=0 <--> action 119), since the product of
+            # three independent draws in {0,1} equals 1 with probability 1/8.
+            a = 1 - np.random.randint(len(possible_actions))*np.random.randint(len(possible_actions))*np.random.randint(len(possible_actions))
+        else:
+            print("Greedy action")
+            # Otherwise, the action is chosen by the DQN.
+            a = utilities.greedy_action(dqn, x)
+
+        # The chosen action is played
+        r = env.act(possible_actions[a])
+        # We determine the reward for this action and its result
+        reward = utilities.clip_reward(r)
+        screen_y = utilities.process_screen(env.getScreenRGB())
+        # Then we add the step to the memory buffer
+        replay_memory.append(screen_x, a, reward, screen_y, env.game_over())
+
+        # NETWORK LEARNING :
+
+        if step > cst.mini_batch_size and step > cst.observation:
+            # After an exploration phase we start training the network
+            X,A,R,Y,D = replay_memory.minibatch(cst.mini_batch_size)
+            QY = dqn.predict(Y)
+            QYmax = QY.max(1).reshape((cst.mini_batch_size,1))
+            update = R + cst.gamma * (1-D) * QYmax
+            QX = dqn.predict(X)
+            QX[np.arange(cst.mini_batch_size), A.ravel()] = update.ravel()
+            loss += dqn.train_on_batch(x=X, y=QX)
+
+        # NEXT STEP PREPARATION :
+
+        if env.game_over():
+            # The episode is restarted if the game is over at this step
+            env.reset_game()
+            screen_x = utilities.process_screen(env.getScreenRGB())
+            stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
+            x = np.stack(stacked_x, axis=-1)
+        else:
+            # Otherwise, the game keeps going
+            screen_x = screen_y
+            stacked_x.append(screen_x)
+            x = np.stack(stacked_x, axis=-1)
+
+        if step > cst.observation:
+            print("STEP", step, ": Epsilon is", utilities.epsilon(step), \
+                  ", the chosen action is", possible_actions[a], ", the reward is", r, "or", reward, \
+                  "and the loss is", loss)
+        else:
+            print("STEP", step, ": Epsilon is", utilities.epsilon(step), \
+                  ", the chosen action is", possible_actions[a], "and the reward is", r, "or", reward)
+
+    # After the last step we save the trained network
+    dqn.save('model_dqn_to_train.h5')
+
+    print("End of training in {:.0f} seconds !".format(time.time() - start))
+
+
+# Main : launches the training, choosing whether we keep training an existing
+# network or start training a new one
+if __name__ == "__main__":
+
+    training_step_choice = input("""Write "init" if you start training the CNN or "keep_going" otherwise """)
+    programme(training_step_choice)
+
+    
\ No newline at end of file
diff --git a/MartinAymeline/utilities.py b/MartinAymeline/utilities.py
new file mode 100644
index 0000000..c1abe4e
--- /dev/null
+++ b/MartinAymeline/utilities.py
@@ -0,0 +1,105 @@
+import numpy as np
+from keras.models import Sequential
+from keras.layers import Dense, Conv2D, Flatten, BatchNormalization, Activation, MaxPooling2D
+from keras.optimizers import Adam
+from keras.models import load_model
+from collections import deque
+from skimage import transform, color
+from constantes import constantes as cst
+import os, pickle
+
+
+def process_screen(rgb_screen):
+    # Initially, the screen dimensions are (288, 512) # env.getScreenDims(), and each
+    # pixel of the image is a vector of its 3 color components.
+    # The pipe is defined such that "pipe_gap = 100" and the gap can start randomly
+    # between "pipe_min = int(pipe_gap/4) = 25" and "pipe_max =
+    # int(512 * 0.79 * 0.6 - pipe_gap / 2) = 193".
+    # Thus, for a given screen "Screen"
+    # whose dimensions are (288, 512), the real playing area is "Screen[:, 25:293]"
+    # whose dimensions are (288, 268). We take some margin on both sides to
+    # obtain a (240, 320) image.
+
+    # PROCESSING : we convert the image to grayscale with 256 grey levels,
+    # crop it to keep the useful playing area, and then downsample
+    # the screen to an (80, 80) image.
+
+    return 256*transform.resize(color.rgb2gray(rgb_screen)[50:270,0:320], (80,80))
+
+def create_network():
+    # Creation of the Convolutional Neural Network that will predict the Q-values
+    dqn = Sequential()
+    # The input is composed of the last 4 frames of the screen, each of size (80, 80)
+    # 1st layer : convolutional layer with 80x80x4 input
+    dqn.add(Conv2D(filters=16, kernel_size=(8,8), strides=4, padding='same', \
+            input_shape=(80,80,4), kernel_initializer='random_normal'))
+    dqn.add(Activation("relu"))
+    # 2nd layer : convolutional layer with ReLU activation
+    dqn.add(Conv2D(filters=32, kernel_size=(4,4), strides=2, padding='same'))
+    dqn.add(BatchNormalization())
+    dqn.add(Activation("relu"))
+    dqn.add(Flatten())
+    # 3rd layer : fully connected layer with 256 ReLU units
+    dqn.add(Dense(units=256))
+    dqn.add(Activation("relu"))
+    # Output layer : fully connected layer with 2 linear units (one Q-value per action)
+    dqn.add(Dense(units=2, activation="linear"))
+
+    # Network compilation
+    adam = Adam(lr=cst.alpha, beta_1=cst.beta_1, beta_2=cst.beta_2)
+    dqn.compile(loss='mean_squared_error', optimizer=adam)
+
+    # Network storing
+    print(dqn.summary())
+    dqn.save('model_dqn_new.h5')
+
+    return dqn
+
+
+def epsilon(step):
+    if step < cst.observation:
+        return 1
+    elif step < 1e6:
+        # linear annealing : 9.9e-8 = (initial_eps - final_eps) / explore
+        return cst.initial_eps - 9.9e-8*step
+    else:
+        return cst.final_eps
+
+
+def clip_reward(r):
+    rr = 0
+    if r > 0:
+        rr = 5 # when Flappy passes a pipe
+    if r < 0:
+        rr = -5 # when Flappy dies
+    return rr
+
+def greedy_action(network, x):
+    Q = network.predict(np.array([x]))
+    print("predicted Q values :", Q)
+    return np.argmax(Q)
+
+def MCeval(env, trials, network, gamma):
+    # Evaluate the performance of the network over full games
+    possible_actions = env.getActionSet() # returns [119, None]
+    scores = np.zeros((trials))
+
+    for i in range(trials):
+        env.reset_game()
+
+        screen_x = process_screen(env.getScreenRGB())
+        stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
+        x = np.stack(stacked_x, axis=-1)
+
+        while not env.game_over():
+            action = possible_actions[greedy_action(network, x)]
+            reward = env.act(action)
+            screen_y = process_screen(env.getScreenRGB())
+            scores[i] = scores[i] + reward
+
+            if not env.game_over():
+                # keep going
+                screen_x = screen_y
+                stacked_x.append(screen_x)
+                x = np.stack(stacked_x, axis=-1)
+
+    return np.mean(scores), np.max(scores)
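
For reference, here is a minimal, self-contained sketch of the Bellman target construction performed in the "NETWORK LEARNING" block of train.py. It uses NumPy only; the array names (QX, QY, A, R, D) and gamma mirror the training code, but the values below are made up purely for illustration.

    import numpy as np

    gamma = 0.99                                       # same discount as cst.gamma
    QX = np.array([[0.2, 0.5], [1.0, 0.3]])            # Q(s, .) predicted for two starting states
    QY = np.array([[0.4, 0.1], [0.0, 0.0]])            # Q(s', .) predicted for the resulting states
    A = np.array([[1], [0]])                           # indices of the actions taken (into [119, None])
    R = np.array([[5], [-5]])                          # clipped rewards
    D = np.array([[0], [1]])                           # 1 if the resulting state is terminal

    QYmax = QY.max(1).reshape((-1, 1))                 # max over actions of Q(s', .)
    update = R + gamma * (1 - D) * QYmax               # Bellman target, cut off at terminal states
    QX[np.arange(len(A)), A.ravel()] = update.ravel()  # only the taken action's Q-value is replaced
    print(QX)                                          # the targets dqn.train_on_batch would be fitted on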