diff --git a/RandomBird/FlappyAgent.py b/RandomBird/FlappyAgent.py
new file mode 100644
index 0000000..d93b12a
--- /dev/null
+++ b/RandomBird/FlappyAgent.py
@@ -0,0 +1,154 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jan 24 14:41:52 2018
+
+@author: Arnaud
+"""
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+import pickle
+
+def epsilon_greedy(Q, y, dist, veloc, epsilon):
+    # Greedy action from the Q-table; with probability epsilon, play a random action instead
+    a = np.argmax(Q[y, dist, veloc, :])
+    if a == 0:
+        a = None
+    else:
+        a = 119
+    if np.random.rand() <= epsilon:  # random action
+        aa = np.random.randint(0, 2)
+        if aa == 0:
+            a = None
+        else:
+            a = 119
+
+    return a
+
+
+# Store the last 9 state-action pairs so they can be penalized when the bird loses
+def updateLast9(last9, y, dist, veloc, ia):
+    for i in range(0, 9):
+        last9[i] = last9[i+1]
+    last9[9][0] = y
+    last9[9][1] = dist
+    last9[9][2] = veloc
+    last9[9][3] = ia
+
+nb_games = 1000000
+sizey = 15  # -300 to 300: difference between the top of next_pipe and the bird
+sizex = 10  # 0 to 283: distance between the bird and next_pipe
+sizev = 9   # discretized velocity
+
+
+# Q-learning: state-action matrix
+#Qql = np.zeros((sizey, sizex, sizev, 2))
+epsilon = 0  # Exploration is not needed; it actually reduces performance: since the last 9 state-action pairs are penalized after a crash, we risk penalizing N useful pairs because of 9-N random actions that led the bird to its death
+
+r = 0  # reward initialization
+
+
+# Model parameters
+alpha = 0.05  # Values of alpha around 0.4 gained performance quickly but systematically led to over-fitting and performance loss (e.g. the average climbing to 80, then falling to 15 and stagnating there)
+gamma = 0.95
+
+# State initialization
+y = 0
+dist = 0
+veloc = 0
+ia = 0
+cumulated = np.zeros((nb_games)) + 5  # cumulative reward per game, offset by +5
+
+## Read the Q-table from file
+f_myfile = open('Q_functionArnaud.pickle', 'rb')
+Qql = pickle.load(f_myfile)
+f_myfile.close()
+
+
+def FlappyPolicy(state, screen):
+    # Discretize the state and play the greedy action from the loaded Q-table
+    y = int((state['player_y'] - state['next_pipe_bottom_y'] + 300) / 40)
+    dist = int(state['next_pipe_dist_to_player'] / 40)
+    veloc = int((state['player_vel'] + 16) / 3)
+    a = np.argmax(Qql[y, dist, veloc, :])
+
+    if a == 0:
+        a = None
+    else:
+        a = 119
+
+    return a
+
+## TRAINING
+# Set train = True to run training
+train = False
+if train:
+    game = FlappyBird()
+    p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
+    p.init()
+
+    for i in range(0, nb_games):
+        p.reset_game()
+        state = game.getGameState()
+        screen = p.getScreenRGB()
+        y = int((state['player_y'] - state['next_pipe_bottom_y'] + 300) / 40)
+        dist = int(state['next_pipe_dist_to_player'] / 40)
+        veloc = int((state['player_vel'] + 16) / 3)
+        last9 = np.zeros((10, 4))
+        while not p.game_over():
+
+            if (i+1) % 100 == 0:
+                epsilon = epsilon / 2
+            if (i+1) > 300:
+                epsilon = 0
+
+            a = epsilon_greedy(Qql, y, dist, veloc, epsilon)
+            if a is None:
+                ia = 0
+            else:
+                ia = 1
+
+            action = a
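Note on the file above: the same three discretization expressions appear in FlappyPolicy, at the start of every training episode, and after every step, and the Q-learning update line is written out twice. A minimal sketch of how they could be factored out is given below; the helper names discretize and q_update are hypothetical and not part of this repository, while the bin offsets and widths (+300 and /40 for the height difference, /40 for the pipe distance, +16 and /3 for the velocity) and the defaults alpha=0.05, gamma=0.95 are taken from the code above.

import numpy as np

def discretize(state):
    # Same binning as in FlappyAgent.py: height difference, pipe distance, velocity
    y = int((state['player_y'] - state['next_pipe_bottom_y'] + 300) / 40)
    dist = int(state['next_pipe_dist_to_player'] / 40)
    veloc = int((state['player_vel'] + 16) / 3)
    return y, dist, veloc

def q_update(Q, s, a, r, s_next, alpha=0.05, gamma=0.95):
    # Tabular Q-learning update: Q[s,a] += alpha * (r + gamma * max_a' Q[s',a'] - Q[s,a])
    y, d, v = s
    yn, dn, vn = s_next
    Q[y, d, v, a] += alpha * (r + gamma * np.max(Q[yn, dn, vn, :]) - Q[y, d, v, a])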
+            rewGame = p.act(action)
+            state = game.getGameState()
+            screen = p.getScreenRGB()
+            ## State update
+            yprec = y
+            distprec = dist
+            velocprec = veloc
+            y = int((state['player_y'] - state['next_pipe_bottom_y'] + 300) / 40)
+            dist = int(state['next_pipe_dist_to_player'] / 40)
+            veloc = int((state['player_vel'] + 16) / 3)
+
+            r = 1
+            updateLast9(last9, y, dist, veloc, ia)
+            Qql[yprec][distprec][velocprec][ia] = Qql[yprec][distprec][velocprec][ia] + alpha * (r + gamma*np.max(Qql[y][dist][veloc][:]) - Qql[yprec][distprec][velocprec][ia])
+
+            cumulated[i] = cumulated[i] + rewGame
+
+        # After a crash, penalize the stored state-action pairs that led to it
+        for l in range(0, 8):
+            r = -1000
+            yprec = int(last9[l][0])
+            y = int(last9[l+1][0])
+            distprec = int(last9[l][1])
+            dist = int(last9[l+1][1])
+            velocprec = int(last9[l][2])
+            veloc = int(last9[l+1][2])
+            ia = int(last9[l+1][3])
+            Qql[yprec][distprec][velocprec][ia] = Qql[yprec][distprec][velocprec][ia] + alpha * (r + gamma*np.max(Qql[y][dist][veloc][:]) - Qql[yprec][distprec][velocprec][ia])
+
+        if i < 100:
+            print("i = ", i, " - cumulated[i] = ", cumulated[i], " - mean = ", np.mean(cumulated[0:i+1]))
+        else:
+            print("i = ", i, " - cumulated[i] = ", cumulated[i], " - mean[-100] = ", np.mean(cumulated[i-100:i]))
+
+    average_score = np.mean(cumulated[20000:24055])
+    max_score = np.max(cumulated)
+
+#print("saving model")
+#f_myfile = open('Q_functionArnaud.pickle', 'wb')
+#pickle.dump(Qql, f_myfile)
+#f_myfile.close()
diff --git a/RandomBird/Q_functionArnaud.pickle b/RandomBird/Q_functionArnaud.pickle
new file mode 100644
index 0000000..7d4408d
Binary files /dev/null and b/RandomBird/Q_functionArnaud.pickle differ
diff --git a/RandomBird/__pycache__/FlappyAgent.cpython-36.pyc b/RandomBird/__pycache__/FlappyAgent.cpython-36.pyc
new file mode 100644
index 0000000..61dd5e2
Binary files /dev/null and b/RandomBird/__pycache__/FlappyAgent.cpython-36.pyc differ
diff --git a/RandomBird/run.py b/RandomBird/run.py
new file mode 100644
index 0000000..49f21b8
--- /dev/null
+++ b/RandomBird/run.py
@@ -0,0 +1,29 @@
+# You're not allowed to change this file
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+from FlappyAgent import FlappyPolicy
+
+game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
+# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.
+
+p.init()
+reward = 0.0
+
+nb_games = 100
+cumulated = np.zeros((nb_games))
+
+for i in range(nb_games):
+    p.reset_game()
+
+    while(not p.game_over()):
+        state = game.getGameState()
+        screen = p.getScreenRGB()
+        action=FlappyPolicy(state, screen) ### Your job is to define this function.
+
+        reward = p.act(action)
+        cumulated[i] = cumulated[i] + reward
+
+average_score = np.mean(cumulated)
+max_score = np.max(cumulated)
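run.py opens a display window (display_screen=True), which slows down batch evaluation, and it may not be modified. A separate evaluation script is sketched below as an assumption-laden convenience, not part of the repository; it only rearranges the calls already used in run.py and turns the display off.

# Hypothetical headless evaluation script; mirrors run.py but without the game window.
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)
p.init()

nb_games = 100
cumulated = np.zeros(nb_games)
for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        action = FlappyPolicy(game.getGameState(), p.getScreenRGB())
        cumulated[i] += p.act(action)

print("average_score =", np.mean(cumulated), "- max_score =", np.max(cumulated))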
diff --git a/RandomBird/sanstitre1.py b/RandomBird/sanstitre1.py
new file mode 100644
index 0000000..898e5d0
--- /dev/null
+++ b/RandomBird/sanstitre1.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar 11 23:04:31 2018
+
+@author: Arnaud
+"""
+import pickle
+f_myfile = open('Q_function_5000_ite.pickle', 'rb')
+Qql = pickle.load(f_myfile)
+f_myfile.close()
+
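sanstitre1.py and the commented-out save block at the end of FlappyAgent.py open the pickle files manually and close them by hand. An equivalent sketch using a context manager is shown below (same filenames as above); it releases the file handle even if loading or dumping raises an exception.

import pickle

# Load a previously trained Q-table (filename from FlappyAgent.py)
with open('Q_functionArnaud.pickle', 'rb') as f:
    Qql = pickle.load(f)

# Save it back after further training
with open('Q_functionArnaud.pickle', 'wb') as f:
    pickle.dump(Qql, f)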
diff --git a/THOMAS/FlappyAgent.py b/THOMAS/FlappyAgent.py
new file mode 100644
index 0000000..d93b12a
--- /dev/null
+++ b/THOMAS/FlappyAgent.py
@@ -0,0 +1,154 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jan 24 14:41:52 2018
+
+@author: Arnaud
+"""
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+import pickle
+
+def epsilon_greedy(Q, y, dist, veloc, epsilon):
+    # Greedy action from the Q-table; with probability epsilon, play a random action instead
+    a = np.argmax(Q[y, dist, veloc, :])
+    if a == 0:
+        a = None
+    else:
+        a = 119
+    if np.random.rand() <= epsilon:  # random action
+        aa = np.random.randint(0, 2)
+        if aa == 0:
+            a = None
+        else:
+            a = 119
+
+    return a
+
+
+# Store the last 9 state-action pairs so they can be penalized when the bird loses
+def updateLast9(last9, y, dist, veloc, ia):
+    for i in range(0, 9):
+        last9[i] = last9[i+1]
+    last9[9][0] = y
+    last9[9][1] = dist
+    last9[9][2] = veloc
+    last9[9][3] = ia
+
+nb_games = 1000000
+sizey = 15  # -300 to 300: difference between the top of next_pipe and the bird
+sizex = 10  # 0 to 283: distance between the bird and next_pipe
+sizev = 9   # discretized velocity
+
+
+# Q-learning: state-action matrix
+#Qql = np.zeros((sizey, sizex, sizev, 2))
+epsilon = 0  # Exploration is not needed; it actually reduces performance: since the last 9 state-action pairs are penalized after a crash, we risk penalizing N useful pairs because of 9-N random actions that led the bird to its death
+
+r = 0  # reward initialization
+
+
+# Model parameters
+alpha = 0.05  # Values of alpha around 0.4 gained performance quickly but systematically led to over-fitting and performance loss (e.g. the average climbing to 80, then falling to 15 and stagnating there)
+gamma = 0.95
+
+# State initialization
+y = 0
+dist = 0
+veloc = 0
+ia = 0
+cumulated = np.zeros((nb_games)) + 5  # cumulative reward per game, offset by +5
+
+## Read the Q-table from file
+f_myfile = open('Q_functionArnaud.pickle', 'rb')
+Qql = pickle.load(f_myfile)
+f_myfile.close()
+
+
+def FlappyPolicy(state, screen):
+    # Discretize the state and play the greedy action from the loaded Q-table
+    y = int((state['player_y'] - state['next_pipe_bottom_y'] + 300) / 40)
+    dist = int(state['next_pipe_dist_to_player'] / 40)
+    veloc = int((state['player_vel'] + 16) / 3)
+    a = np.argmax(Qql[y, dist, veloc, :])
+
+    if a == 0:
+        a = None
+    else:
+        a = 119
+
+    return a
+
+## TRAINING
+# Set train = True to run training
+train = False
+if train:
+    game = FlappyBird()
+    p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
+    p.init()
+
+    for i in range(0, nb_games):
+        p.reset_game()
+        state = game.getGameState()
+        screen = p.getScreenRGB()
+        y = int((state['player_y'] - state['next_pipe_bottom_y'] + 300) / 40)
+        dist = int(state['next_pipe_dist_to_player'] / 40)
+        veloc = int((state['player_vel'] + 16) / 3)
+        last9 = np.zeros((10, 4))
+        while not p.game_over():
+
+            if (i+1) % 100 == 0:
+                epsilon = epsilon / 2
+            if (i+1) > 300:
+                epsilon = 0
+
+            a = epsilon_greedy(Qql, y, dist, veloc, epsilon)
+            if a is None:
+                ia = 0
+            else:
+                ia = 1
+
+            action = a
+            rewGame = p.act(action)
+            state = game.getGameState()
+            screen = p.getScreenRGB()
+            ## State update
+            yprec = y
+            distprec = dist
+            velocprec = veloc
+            y = int((state['player_y'] - state['next_pipe_bottom_y'] + 300) / 40)
+            dist = int(state['next_pipe_dist_to_player'] / 40)
+            veloc = int((state['player_vel'] + 16) / 3)
+
+            r = 1
+            updateLast9(last9, y, dist, veloc, ia)
+            Qql[yprec][distprec][velocprec][ia] = Qql[yprec][distprec][velocprec][ia] + alpha * (r + gamma*np.max(Qql[y][dist][veloc][:]) - Qql[yprec][distprec][velocprec][ia])
+
+            cumulated[i] = cumulated[i] + rewGame
+
+        # After a crash, penalize the stored state-action pairs that led to it
+        for l in range(0, 8):
+            r = -1000
+            yprec = int(last9[l][0])
+            y = int(last9[l+1][0])
+            distprec = int(last9[l][1])
+            dist = int(last9[l+1][1])
+            velocprec = int(last9[l][2])
+            veloc = int(last9[l+1][2])
+            ia = int(last9[l+1][3])
+            Qql[yprec][distprec][velocprec][ia] = Qql[yprec][distprec][velocprec][ia] + alpha * (r + gamma*np.max(Qql[y][dist][veloc][:]) - Qql[yprec][distprec][velocprec][ia])
+
+        if i < 100:
+            print("i = ", i, " - cumulated[i] = ", cumulated[i], " - mean = ", np.mean(cumulated[0:i+1]))
+        else:
+            print("i = ", i, " - cumulated[i] = ", cumulated[i], " - mean[-100] = ", np.mean(cumulated[i-100:i]))
+
+    average_score = np.mean(cumulated[20000:24055])
+    max_score = np.max(cumulated)
+
+#print("saving model")
+#f_myfile = open('Q_functionArnaud.pickle', 'wb')
+#pickle.dump(Qql, f_myfile)
+#f_myfile.close()
diff --git a/THOMAS/Q_functionArnaud.pickle b/THOMAS/Q_functionArnaud.pickle
new file mode 100644
index 0000000..7d4408d
Binary files /dev/null and b/THOMAS/Q_functionArnaud.pickle differ
diff --git a/THOMAS/__pycache__/FlappyAgent.cpython-36.pyc b/THOMAS/__pycache__/FlappyAgent.cpython-36.pyc
new file mode 100644
index 0000000..61dd5e2
Binary files /dev/null and b/THOMAS/__pycache__/FlappyAgent.cpython-36.pyc differ
diff --git a/THOMAS/run.py b/THOMAS/run.py
new file mode 100644
index 0000000..49f21b8
--- /dev/null
+++ b/THOMAS/run.py
@@ -0,0 +1,29 @@
+# You're not allowed to change this file
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+from FlappyAgent import FlappyPolicy
+
+game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
+# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.
+
+p.init()
+reward = 0.0
+
+nb_games = 100
+cumulated = np.zeros((nb_games))
+
+for i in range(nb_games):
+    p.reset_game()
+
+    while(not p.game_over()):
+        state = game.getGameState()
+        screen = p.getScreenRGB()
+        action=FlappyPolicy(state, screen) ### Your job is to define this function.
+
+        reward = p.act(action)
+        cumulated[i] = cumulated[i] + reward
+
+average_score = np.mean(cumulated)
+max_score = np.max(cumulated)
diff --git a/THOMAS/sanstitre1.py b/THOMAS/sanstitre1.py
new file mode 100644
index 0000000..898e5d0
--- /dev/null
+++ b/THOMAS/sanstitre1.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar 11 23:04:31 2018
+
+@author: Arnaud
+"""
+import pickle
+f_myfile = open('Q_function_5000_ite.pickle', 'rb')
+Qql = pickle.load(f_myfile)
+f_myfile.close()
+